[viennacl] 02/09: Merge tag 'upstream/1.5.1'

Toby St Clere Smithe tsmithe-guest at moszumanska.debian.org
Wed Feb 19 19:09:55 UTC 2014


This is an automated email from the git hooks/post-receive script.

tsmithe-guest pushed a commit to branch master
in repository viennacl.

commit 18f2568a0b2df033e0f4ddc58b3e642866bce929
Merge: ac889ed d19020e
Author: Toby Smithe <git at tsmithe.net>
Date:   Wed Feb 19 16:11:14 2014 +0000

    Merge tag 'upstream/1.5.1'
    
    Upstream version 1.5.1
    
    Conflicts:
    	CMakeLists.txt
    	README
    	auxiliary/CMakeLists.txt
    	auxiliary/converter.cpp
    	auxiliary/generate-blas3-prod-align1.cpp
    	auxiliary/matrix_col/align1/add.cl
    	auxiliary/matrix_col/align1/assign.cl
    	auxiliary/matrix_col/align1/clear.cl
    	auxiliary/matrix_col/align1/cpu_inplace_mult.cl
    	auxiliary/matrix_col/align1/inplace_add.cl
    	auxiliary/matrix_col/align1/inplace_divide.cl
    	auxiliary/matrix_col/align1/inplace_mult.cl
    	auxiliary/matrix_col/align1/inplace_sub.cl
    	auxiliary/matrix_col/align1/sub.cl
    	auxiliary/matrix_col/align1/trans_vec_mul.cl
    	auxiliary/matrix_col/align1/vec_mul.cl
    	auxiliary/matrix_row/align1/add.cl
    	auxiliary/matrix_row/align1/assign.cl
    	auxiliary/matrix_row/align1/clear.cl
    	auxiliary/matrix_row/align1/cpu_inplace_mult.cl
    	auxiliary/matrix_row/align1/inplace_add.cl
    	auxiliary/matrix_row/align1/inplace_divide.cl
    	auxiliary/matrix_row/align1/inplace_mult.cl
    	auxiliary/matrix_row/align1/inplace_sub.cl
    	auxiliary/matrix_row/align1/sub.cl
    	auxiliary/matrix_row/align1/trans_vec_mul.cl
    	auxiliary/matrix_row/align1/vec_mul.cl
    	auxiliary/vector/align1/add.cl
    	auxiliary/vector/align1/clear.cl
    	auxiliary/vector/align1/cpu_inplace_mul_add.cl
    	auxiliary/vector/align1/cpu_inplace_mult.cl
    	auxiliary/vector/align1/cpu_mul_add.cl
    	auxiliary/vector/align1/cpu_mult.cl
    	auxiliary/vector/align1/diag_precond.cl
    	auxiliary/vector/align1/divide.cl
    	auxiliary/vector/align1/index_norm_inf.cl
    	auxiliary/vector/align1/inner_prod.cl
    	auxiliary/vector/align1/inplace_add.cl
    	auxiliary/vector/align1/inplace_div_add.cl
    	auxiliary/vector/align1/inplace_div_sub.cl
    	auxiliary/vector/align1/inplace_divide.cl
    	auxiliary/vector/align1/inplace_mul_add.cl
    	auxiliary/vector/align1/inplace_mul_sub.cl
    	auxiliary/vector/align1/inplace_mult.cl
    	auxiliary/vector/align1/inplace_sub.cl
    	auxiliary/vector/align1/mul_add.cl
    	auxiliary/vector/align1/mul_sub.cl
    	auxiliary/vector/align1/mult.cl
    	auxiliary/vector/align1/norm_1.cl
    	auxiliary/vector/align1/norm_2.cl
    	auxiliary/vector/align1/norm_inf.cl
    	auxiliary/vector/align1/plane_rotation.cl
    	auxiliary/vector/align1/sqrt_sum.cl
    	auxiliary/vector/align1/sub.cl
    	auxiliary/vector/align1/sum.cl
    	auxiliary/vector/align1/swap.cl
    	auxiliary/vector/align1/vmax.cl
    	auxiliary/vector/align16/add.cl
    	auxiliary/vector/align16/cpu_inplace_mul.cl
    	auxiliary/vector/align16/cpu_mult.cl
    	auxiliary/vector/align16/divide.cl
    	auxiliary/vector/align16/inplace_add.cl
    	auxiliary/vector/align16/inplace_divide.cl
    	auxiliary/vector/align16/inplace_mult.cl
    	auxiliary/vector/align16/inplace_sub.cl
    	auxiliary/vector/align16/mult.cl
    	auxiliary/vector/align16/sub.cl
    	auxiliary/vector/align4/cpu_inplace_mul_add.cl
    	auxiliary/vector/align4/cpu_mul_add.cl
    	auxiliary/vector/align4/inplace_div_add.cl
    	auxiliary/vector/align4/inplace_div_sub.cl
    	auxiliary/vector/align4/inplace_mul_add.cl
    	auxiliary/vector/align4/inplace_mul_sub.cl
    	auxiliary/vector/align4/mul_add.cl
    	changelog
    	doc/Doxyfile.in
    	doc/manual/algorithms.tex
    	doc/manual/contributors.tex
    	doc/manual/cover.tex
    	doc/manual/kernel-generation.tex
    	doc/manual/multi-device.tex
    	doc/manual/types.tex
    	doc/manual/viennacl.bib
    	doc/manual/viennacl.tex
    	examples/benchmarks/CMakeLists.txt
    	examples/benchmarks/blas3.cpp
    	examples/benchmarks/solver.cpp
    	examples/benchmarks/sparse.cpp
    	examples/tutorial/CMakeLists.txt
    	examples/tutorial/iterative-ublas.cpp
    	examples/tutorial/iterative.cpp
    	examples/tutorial/lanczos.cpp
    	examples/tutorial/power-iter.cpp
    	examples/tutorial/qr.cpp
    	tests/CMakeLists.txt
    	tests/src/blas3_solve_double.cpp
    	tests/src/external_1.cpp
    	tests/src/external_2.cpp
    	tests/src/matrix.cpp
    	tests/src/matrix_range.cpp
    	tests/src/nmf.cpp
    	tests/src/sparse.cpp
    	tests/src/svd.cpp
    	tests/src/vector_range.cpp
    	viennacl/ell_matrix.hpp
    	viennacl/forwards.h
    	viennacl/generator/forwards.h
    	viennacl/hyb_matrix.hpp
    	viennacl/linalg/bicgstab.hpp
    	viennacl/linalg/bisect.hpp
    	viennacl/linalg/cg.hpp
    	viennacl/linalg/coordinate_matrix_operations.hpp
    	viennacl/linalg/detail/ilu/block_ilu.hpp
    	viennacl/linalg/detail/ilu/common.hpp
    	viennacl/linalg/detail/ilu/ilu0.hpp
    	viennacl/linalg/detail/ilu/ilut.hpp
    	viennacl/linalg/eig.hpp
    	viennacl/linalg/gmres.hpp
    	viennacl/linalg/ilu.hpp
    	viennacl/linalg/inner_prod.hpp
    	viennacl/linalg/jacobi_precond.hpp
    	viennacl/linalg/lanczos.hpp
    	viennacl/linalg/matrix_operations.hpp
    	viennacl/linalg/nmf.hpp
    	viennacl/linalg/norm_1.hpp
    	viennacl/linalg/norm_2.hpp
    	viennacl/linalg/norm_inf.hpp
    	viennacl/linalg/power_iter.hpp
    	viennacl/linalg/prod.hpp
    	viennacl/linalg/qr.hpp
    	viennacl/linalg/row_scaling.hpp
    	viennacl/linalg/svd.hpp
    	viennacl/linalg/vector_operations.hpp
    	viennacl/matrix.hpp
    	viennacl/matrix_proxy.hpp
    	viennacl/meta/predicate.hpp
    	viennacl/meta/result_of.hpp
    	viennacl/meta/tag_of.hpp
    	viennacl/ocl/backend.hpp
    	viennacl/ocl/context.hpp
    	viennacl/ocl/enqueue.hpp
    	viennacl/ocl/kernel.hpp
    	viennacl/ocl/platform.hpp
    	viennacl/slice.hpp
    	viennacl/toeplitz_matrix.hpp
    	viennacl/tools/adapter.hpp
    	viennacl/tools/matrix_kernel_class_deducer.hpp
    	viennacl/tools/matrix_prod_kernel_class_deducer.hpp
    	viennacl/tools/matrix_size_deducer.hpp
    	viennacl/tools/tools.hpp
    	viennacl/traits/handle.hpp
    	viennacl/traits/size.hpp
    	viennacl/traits/start.hpp
    	viennacl/traits/stride.hpp
    	viennacl/vector.hpp
    	viennacl/vector_proxy.hpp

 CL/cl_gl_ext.h                                     |    2 +-
 CMakeLists.txt                                     |   13 +-
 LICENSE                                            |    7 +-
 README                                             |   53 +-
 auxiliary/README                                   |   17 -
 .../compressed_matrix/align1/bicgstab_kernel1.cl   |   54 -
 .../compressed_matrix/align1/bicgstab_kernel2.cl   |   81 -
 auxiliary/compressed_matrix/align1/jacobi.cl       |   28 -
 .../compressed_matrix/align1/jacobi_precond.cl     |   26 -
 auxiliary/compressed_matrix/align1/lu_backward.cl  |  115 -
 auxiliary/compressed_matrix/align1/lu_forward.cl   |  107 -
 .../compressed_matrix/align1/row_scaling_1.cl      |   20 -
 .../compressed_matrix/align1/row_scaling_2.cl      |   24 -
 auxiliary/compressed_matrix/align1/vec_mul.cl      |   21 -
 auxiliary/compressed_matrix/align4/vec_mul.cl      |   37 -
 auxiliary/compressed_matrix/align8/vec_mul.cl      |   42 -
 auxiliary/compressed_matrix/matrix.old_cl          |  226 -
 auxiliary/coordinate_matrix/align1/vec_mul.cl      |  126 -
 auxiliary/coordinate_matrix/align128/dummy         |    1 -
 auxiliary/coordinate_matrix/matrix.old_cl          |  822 --
 auxiliary/fft/align1/bluestein_post.cl             |   23 -
 auxiliary/fft/align1/bluestein_pre.cl              |   34 -
 auxiliary/fft/align1/complex_to_real.cl            |    8 -
 auxiliary/fft/align1/fft_div_vec_scalar.cl         |    7 -
 auxiliary/fft/align1/fft_mult_vec.cl               |   13 -
 auxiliary/fft/align1/real_to_complex.cl            |   11 -
 auxiliary/fft/align1/reverse_inplace.cl            |   11 -
 auxiliary/fft/align1/transpose.cl                  |   16 -
 auxiliary/fft/align1/transpose_inplace.cl          |   22 -
 auxiliary/fft/align1/vandermonde_prod.cl           |   19 -
 auxiliary/fft/align1/zero2.cl                      |   11 -
 auxiliary/generate-blas3-solve-align1.cpp          |  183 -
 auxiliary/matrix_col/align1/fft_direct.cl          |   29 -
 auxiliary/matrix_col/align1/fft_radix2.cl          |   39 -
 auxiliary/matrix_col/align1/fft_radix2_local.cl    |   74 -
 auxiliary/matrix_col/align1/fft_reorder.cl         |   38 -
 .../align1/lower_triangular_substitute_inplace.cl  |   26 -
 auxiliary/matrix_col/align1/lu_factorize.cl        |   27 -
 auxiliary/matrix_col/align1/rank1_update.cl        |   21 -
 auxiliary/matrix_col/align1/scaled_rank1_update.cl |   22 -
 .../trans_lower_triangular_substitute_inplace.cl   |   26 -
 ...ans_unit_lower_triangular_substitute_inplace.cl |   22 -
 ...ans_unit_upper_triangular_substitute_inplace.cl |   24 -
 .../trans_upper_triangular_substitute_inplace.cl   |   28 -
 .../unit_lower_triangular_substitute_inplace.cl    |   22 -
 .../unit_upper_triangular_substitute_inplace.cl    |   23 -
 .../align1/upper_triangular_substitute_inplace.cl  |   27 -
 auxiliary/matrix_col/align16/dummy                 |    1 -
 auxiliary/matrix_col/matrix.old_cl                 |  120 -
 auxiliary/matrix_row/align1/fft_direct.cl          |   32 -
 auxiliary/matrix_row/align1/fft_radix2.cl          |   46 -
 auxiliary/matrix_row/align1/fft_radix2_local.cl    |   72 -
 auxiliary/matrix_row/align1/fft_reorder.cl         |   42 -
 .../align1/lower_triangular_substitute_inplace.cl  |   26 -
 auxiliary/matrix_row/align1/lu_factorize.cl        |   31 -
 auxiliary/matrix_row/align1/rank1_update.cl        |   23 -
 auxiliary/matrix_row/align1/scaled_rank1_update.cl |   24 -
 .../trans_lower_triangular_substitute_inplace.cl   |   26 -
 ...ans_unit_lower_triangular_substitute_inplace.cl |   22 -
 ...ans_unit_upper_triangular_substitute_inplace.cl |   24 -
 .../trans_upper_triangular_substitute_inplace.cl   |   28 -
 .../unit_lower_triangular_substitute_inplace.cl    |   22 -
 .../unit_upper_triangular_substitute_inplace.cl    |   23 -
 .../align1/upper_triangular_substitute_inplace.cl  |   27 -
 auxiliary/matrix_row/align16/dummy                 |    1 -
 auxiliary/matrix_row/matrix.old_cl                 |  120 -
 auxiliary/scalar/align1/add.cl                     |   10 -
 auxiliary/scalar/align1/cpu_add.cl                 |   10 -
 auxiliary/scalar/align1/cpu_div.cl                 |   10 -
 auxiliary/scalar/align1/cpu_inplace_add.cl         |    9 -
 auxiliary/scalar/align1/cpu_inplace_div.cl         |   10 -
 auxiliary/scalar/align1/cpu_inplace_mul.cl         |    9 -
 auxiliary/scalar/align1/cpu_inplace_sub.cl         |   10 -
 auxiliary/scalar/align1/cpu_mul.cl                 |   10 -
 auxiliary/scalar/align1/cpu_sub.cl                 |   10 -
 auxiliary/scalar/align1/divide.cl                  |   12 -
 auxiliary/scalar/align1/inplace_add.cl             |    9 -
 auxiliary/scalar/align1/inplace_div.cl             |    9 -
 auxiliary/scalar/align1/inplace_mul.cl             |    9 -
 auxiliary/scalar/align1/inplace_sub.cl             |    9 -
 auxiliary/scalar/align1/mul.cl                     |   10 -
 auxiliary/scalar/align1/sub.cl                     |   10 -
 auxiliary/spai/align1/assemble_blocks.cl           |   60 -
 auxiliary/spai/align1/block_bv_assembly.cl         |   33 -
 auxiliary/spai/align1/block_least_squares.cl       |   68 -
 auxiliary/spai/align1/block_q_mult.cl              |   74 -
 auxiliary/spai/align1/block_qr.cl                  |  130 -
 auxiliary/spai/align1/block_qr_assembly.cl         |   57 -
 auxiliary/spai/align1/block_qr_assembly_1.cl       |   36 -
 auxiliary/spai/align1/block_r_assembly.cl          |   68 -
 auxiliary/vector/align4/inner_prod.cl_disabled     |   40 -
 auxiliary/vector/align4/norm_2.cl_disabled         |   47 -
 changelog                                          |  142 +-
 cmake/FindMTL.cmake                                |   11 +-
 cmake/FindOpenCL.cmake                             |    6 +
 cmake/ViennaCLCommon.cmake                         |   59 +-
 doc/CMakeLists.txt                                 |    3 +
 doc/Doxyfile.in                                    | 1314 +--
 doc/manual/additional-algorithms.tex               |  221 +
 doc/manual/algorithms.tex                          |  307 +-
 doc/manual/benchmarks.tex                          |    2 +-
 doc/manual/changelogs.tex                          |  192 +-
 doc/manual/contributors.tex                        |   21 +-
 doc/manual/cover.tex                               |   12 +-
 doc/manual/custom-contexts.tex                     |   13 +-
 doc/manual/custom-kernels.tex                      |   30 +-
 doc/manual/design.tex                              |   11 +-
 doc/manual/figures/TU_Signet_CMYK.eps              |   10 +-
 doc/manual/installation.tex                        |  206 +-
 doc/manual/introduction.tex                        |   51 +-
 doc/manual/kernel-generation.tex                   |   39 +-
 doc/manual/keywords.tex                            |    4 +-
 doc/manual/license.tex                             |   10 +-
 doc/manual/memory-model.tex                        |   46 +
 doc/manual/multi-device.tex                        |   29 +-
 doc/manual/operations.tex                          |  202 +-
 doc/manual/other-libs.tex                          |   36 +-
 doc/manual/setup.tex                               |    2 +-
 doc/manual/shared-lib.tex                          |   14 +
 doc/manual/structured-matrices.tex                 |   98 +
 doc/manual/tuning.tex                              |    4 +-
 doc/manual/types.tex                               |  250 +-
 doc/manual/versioning.tex                          |    2 +-
 doc/manual/viennacl.bib                            |   27 +-
 doc/manual/viennacl.tex                            |   79 +-
 examples/CMakeLists.txt                            |    4 +-
 examples/autotuner/CMakeLists.txt                  |   12 +
 examples/autotuner/command-line-utils.hpp          |   55 +
 examples/autotuner/dot_autotuning.cpp              |  270 +
 examples/autotuner/dump_default_kernels.cpp        |   81 +
 examples/autotuner/gemm_autotuning.cpp             |  382 +
 examples/autotuner/gemv_autotuning.cpp             |  266 +
 examples/autotuner/vector-axpy_autotuning.cpp      |  270 +
 examples/benchmarks/CMakeLists.txt                 |   58 +-
 examples/benchmarks/benchmark-utils.hpp            |  197 +-
 examples/benchmarks/blas3.cpp                      |  249 +
 examples/benchmarks/{blas3.cpp => blas3.cu}        |  446 +-
 examples/benchmarks/copy.cpp                       |  189 +
 examples/benchmarks/copy.cu                        |  189 +
 examples/benchmarks/generator_blas1.cpp            |  135 +
 examples/benchmarks/generator_blas2.cpp            |  127 +
 examples/benchmarks/generator_blas3.cpp            |  129 +
 examples/benchmarks/io.hpp                         |  229 +-
 examples/benchmarks/opencl.cpp                     |  288 +-
 examples/benchmarks/qr.cpp                         |   19 +-
 examples/benchmarks/{qr.cpp => qr.cu}              |   19 +-
 examples/benchmarks/{opencl.cpp => scheduler.cpp}  |  293 +-
 examples/benchmarks/solver.cpp                     |  648 ++
 examples/benchmarks/{solver.cpp => solver.cu}      | 1096 ++-
 examples/benchmarks/sparse.cpp                     |  325 +
 examples/benchmarks/{sparse.cpp => sparse.cu}      |  625 +-
 examples/benchmarks/vector.cpp                     |  522 +-
 examples/benchmarks/{vector.cpp => vector.cu}      |  522 +-
 examples/parameters/CMakeLists.txt                 |   25 -
 examples/parameters/benchmark-utils.hpp            |   98 -
 examples/parameters/common.hpp                     |  251 -
 examples/parameters/common_vprof.hpp               |   53 -
 examples/parameters/matrix.cpp                     |  267 -
 examples/parameters/matrix_functors.hpp            |   91 -
 examples/parameters/parameter_reader.cpp           |   65 -
 examples/parameters/sparse.cpp                     |  245 -
 examples/parameters/vector.cpp                     |  254 -
 examples/parameters/vector_functors.hpp            |  204 -
 examples/testdata/eigen/nsm1.example               |    6 +
 examples/testdata/eigen/nsm2.example               |   12 +
 examples/testdata/eigen/nsm3.example               |  273 +
 examples/testdata/eigen/nsm4.example               |  921 ++
 examples/testdata/eigen/symm1.example              |    6 +
 examples/testdata/eigen/symm2.example              |   12 +
 examples/testdata/eigen/symm3.example              |  922 ++
 examples/tutorial/CMakeLists.txt                   |  111 +-
 examples/tutorial/Random.hpp                       |  104 +-
 examples/tutorial/amg.cpp                          |  120 +-
 examples/tutorial/bandwidth-reduction.cpp          |  109 +-
 examples/tutorial/blas1.cpp                        |  421 +-
 examples/tutorial/{blas1.cpp => blas1.cu}          |  421 +-
 examples/tutorial/blas2.cpp                        |  521 +-
 examples/tutorial/{blas2.cpp => blas2.cu}          |  521 +-
 examples/tutorial/blas3.cpp                        |  378 +-
 examples/tutorial/{blas3.cpp => blas3.cu}          |  378 +-
 examples/tutorial/blas3range.cpp                   |  424 +-
 .../tutorial/{blas3range.cpp => blas3range.cu}     |  424 +-
 examples/tutorial/custom-context.cpp               |  562 +-
 examples/tutorial/custom-kernels.cpp               |  276 +-
 examples/tutorial/eigen-with-viennacl.cpp          |   27 +-
 examples/tutorial/fft.cpp                          |   40 +-
 examples/tutorial/iterative-eigen.cpp              |   48 +-
 examples/tutorial/iterative-mtl4.cpp               |   69 +-
 examples/tutorial/iterative-ublas.cpp              |  159 +
 examples/tutorial/iterative.cpp                    |  238 +
 examples/tutorial/{iterative.cpp => iterative.cu}  |  479 +-
 examples/tutorial/lanczos.cpp                      |   48 +
 examples/tutorial/{lanczos.cpp => lanczos.cu}      |   31 +-
 examples/tutorial/least-squares.cpp                |  144 +
 examples/tutorial/least-squares.cu                 |  144 +
 examples/tutorial/libviennacl.cpp                  |  105 +
 examples/tutorial/libviennacl.cu                   |  105 +
 examples/tutorial/matrix-range.cpp                 |   56 +-
 .../tutorial/{matrix-range.cpp => matrix-range.cu} |   56 +-
 examples/tutorial/mtl4-with-viennacl.cpp           |   13 +-
 examples/tutorial/multithreaded.cpp                |  127 +
 examples/tutorial/multithreaded_cg.cpp             |  185 +
 examples/tutorial/power-iter.cpp                   |   46 +
 .../tutorial/{power-iter.cpp => power-iter.cu}     |   34 +-
 examples/tutorial/qr.cpp                           |   67 +-
 examples/tutorial/{qr.cpp => qr.cu}                |   63 +-
 examples/tutorial/rand.cpp                         |   70 +
 examples/tutorial/scheduler.cpp                    |  130 +
 examples/tutorial/spai.cpp                         |  102 +-
 examples/tutorial/sparse.cpp                       |  235 +-
 examples/tutorial/{sparse.cpp => sparse.cu}        |  235 +-
 examples/tutorial/structured-matrices.cpp          |   36 +-
 examples/tutorial/vector-io.hpp                    |  345 +-
 examples/tutorial/vector-range.cpp                 |   55 +-
 .../tutorial/{vector-range.cpp => vector-range.cu} |   55 +-
 examples/tutorial/viennacl-info.cpp                |   92 +-
 examples/tutorial/wrap-cuda-buffer.cu              |  121 +
 examples/tutorial/wrap-host-buffer.cpp             |   86 +
 external/pugixml/src/pugiconfig.hpp                |   62 -
 external/pugixml/src/pugixml.cpp                   | 9576 --------------------
 external/pugixml/src/pugixml.hpp                   | 1131 ---
 external/tclap/Arg.h                               |  692 ++
 external/tclap/ArgException.h                      |  200 +
 external/tclap/ArgTraits.h                         |   87 +
 external/tclap/COPYING                             |   25 +
 external/tclap/CmdLine.h                           |  633 ++
 external/tclap/CmdLineInterface.h                  |  150 +
 external/tclap/CmdLineOutput.h                     |   74 +
 external/tclap/Constraint.h                        |   68 +
 external/tclap/DocBookOutput.h                     |  299 +
 external/tclap/HelpVisitor.h                       |   76 +
 external/tclap/IgnoreRestVisitor.h                 |   52 +
 external/tclap/MultiArg.h                          |  433 +
 external/tclap/MultiSwitchArg.h                    |  216 +
 external/tclap/OptionalUnlabeledTracker.h          |   62 +
 external/tclap/StandardTraits.h                    |  208 +
 external/tclap/StdOutput.h                         |  298 +
 external/tclap/SwitchArg.h                         |  266 +
 external/tclap/UnlabeledMultiArg.h                 |  301 +
 external/tclap/UnlabeledValueArg.h                 |  340 +
 external/tclap/ValueArg.h                          |  425 +
 external/tclap/ValuesConstraint.h                  |  148 +
 external/tclap/VersionVisitor.h                    |   81 +
 external/tclap/Visitor.h                           |   53 +
 external/tclap/XorHandler.h                        |  166 +
 external/tclap/ZshCompletionOutput.h               |  323 +
 libviennacl/CMakeLists.txt                         |   35 +
 libviennacl/include/viennacl.hpp                   |  607 ++
 libviennacl/src/backend.cpp                        |   46 +
 libviennacl/src/backend.cu                         |   46 +
 libviennacl/src/blas1.cpp                          |  402 +
 libviennacl/src/blas1.cu                           |  402 +
 libviennacl/src/blas1_cuda.cu                      |  264 +
 libviennacl/src/blas1_host.cpp                     |  257 +
 libviennacl/src/blas1_host.cu                      |  257 +
 libviennacl/src/blas1_opencl.cpp                   |  261 +
 libviennacl/src/blas1_opencl.cu                    |  261 +
 libviennacl/src/blas2.cpp                          |  309 +
 libviennacl/src/blas2.cu                           |  309 +
 libviennacl/src/blas2_cuda.cu                      |  286 +
 libviennacl/src/blas2_host.cpp                     |  283 +
 libviennacl/src/blas2_host.cu                      |  283 +
 libviennacl/src/blas2_opencl.cpp                   |  283 +
 libviennacl/src/blas2_opencl.cu                    |  283 +
 libviennacl/src/blas3.cpp                          |  970 ++
 libviennacl/src/blas3.cu                           |  970 ++
 libviennacl/src/blas3.hpp                          |   60 +
 libviennacl/src/blas3_cuda.cu                      |  249 +
 libviennacl/src/blas3_host.cpp                     |  243 +
 libviennacl/src/blas3_host.cu                      |  243 +
 libviennacl/src/blas3_opencl.cpp                   |  249 +
 libviennacl/src/blas3_opencl.cu                    |  249 +
 libviennacl/src/init_matrix.hpp                    |  101 +
 libviennacl/src/init_vector.hpp                    |  101 +
 libviennacl/src/viennacl_private.hpp               |  141 +
 tests/CMakeLists.txt                               |  122 +-
 {examples/tutorial => tests/src}/Random.hpp        |  105 +-
 tests/src/blas3_prod_double.cpp                    |   65 +
 tests/src/blas3_prod_double.cu                     |   65 +
 tests/src/blas3_prod_float.cpp                     |   61 +
 tests/src/blas3_prod_float.cu                      |   61 +
 tests/src/blas3_prod_float_double.hpp              |  855 ++
 tests/src/{blas3.cpp => blas3_solve_double.cpp}    |  408 +-
 .../src/{blas3range.cpp => blas3_solve_double.cu}  |  519 +-
 tests/src/{blas3.cpp => blas3_solve_float.cpp}     |  515 +-
 tests/src/{blas3.cpp => blas3_solve_float.cu}      |  515 +-
 tests/src/blas3_solve_float_double.hpp             |  514 ++
 tests/src/external_1.cpp                           |   44 +-
 tests/src/{external_1.cpp => external_1.cu}        |   46 +-
 tests/src/external_2.cpp                           |   36 +-
 tests/src/{external_2.cpp => external_2.cu}        |   37 +-
 tests/src/fft.cpp                                  |  169 +-
 tests/src/generator_blas1.cpp                      |  524 ++
 tests/src/generator_blas2.cpp                      |  261 +
 tests/src/generator_blas3.cpp                      |  424 +
 tests/src/global_variables.cpp                     |   85 +
 tests/src/global_variables.cu                      |   85 +
 tests/src/iterators.cpp                            |   21 +-
 tests/src/{iterators.cpp => iterators.cu}          |   21 +-
 tests/src/libviennacl_blas1.cpp                    |  668 ++
 tests/src/libviennacl_blas1.cu                     |  668 ++
 tests/src/libviennacl_blas2.cpp                    |  265 +
 tests/src/libviennacl_blas2.cu                     |  265 +
 tests/src/libviennacl_blas3.cpp                    |  623 ++
 tests/src/libviennacl_blas3.cu                     |  623 ++
 tests/src/matrix_col_double.cpp                    |   52 +
 tests/src/matrix_col_double.cu                     |   52 +
 tests/src/matrix_col_float.cpp                     |   45 +
 tests/src/matrix_col_float.cu                      |   45 +
 tests/src/matrix_col_int.cpp                       |   48 +
 tests/src/matrix_col_int.cu                        |   48 +
 tests/src/matrix_float_double.hpp                  | 1304 +++
 tests/src/matrix_int.hpp                           | 1107 +++
 tests/src/matrix_row_double.cpp                    |   51 +
 tests/src/matrix_row_double.cu                     |   51 +
 tests/src/matrix_row_float.cpp                     |   44 +
 tests/src/matrix_row_float.cu                      |   44 +
 tests/src/matrix_row_int.cpp                       |   48 +
 tests/src/matrix_row_int.cu                        |   48 +
 tests/src/matrix_vector.cpp                        | 1146 +++
 tests/src/matrix_vector.cu                         | 1146 +++
 tests/src/matrix_vector_int.cpp                    |  823 ++
 tests/src/matrix_vector_int.cu                     |  823 ++
 tests/src/nmf.cpp                                  |  103 +
 tests/src/qr_method.cpp                            |  277 +
 tests/src/scalar.cpp                               |  390 +-
 tests/src/scalar.cu                                |  461 +
 tests/src/scheduler_matrix.cpp                     |  920 ++
 tests/src/scheduler_matrix_matrix.cpp              |  954 ++
 tests/src/scheduler_matrix_vector.cpp              |  945 ++
 tests/src/scheduler_sparse.cpp                     |  456 +
 tests/src/scheduler_vector.cpp                     |  697 ++
 tests/src/sparse.cpp                               |  962 +-
 tests/src/sparse.cu                                |  891 ++
 tests/src/spmdm.cpp                                |  339 +
 tests/src/spmdm.cu                                 |  339 +
 tests/src/structured-matrices.cpp                  |  216 +-
 tests/src/svd.cpp                                  |  296 +
 tests/src/vector.cpp                               |  705 --
 tests/src/vector_double.cpp                        |   66 +
 tests/src/vector_double.cu                         |   66 +
 tests/src/vector_float.cpp                         |   62 +
 tests/src/vector_float.cu                          |   62 +
 tests/src/vector_float_double.hpp                  | 1717 ++++
 tests/src/vector_int.cpp                           | 1523 ++++
 tests/src/vector_int.cu                            | 1523 ++++
 tests/src/vector_multi_inner_prod.cpp              |  584 ++
 tests/src/vector_multi_inner_prod.cu               |  584 ++
 tests/src/vector_uint.cpp                          |  966 ++
 tests/src/vector_uint.cu                           |  966 ++
 viennacl/backend/cpu_ram.hpp                       |  143 +
 viennacl/backend/cuda.hpp                          |  190 +
 viennacl/backend/mem_handle.hpp                    |  225 +
 viennacl/backend/memory.hpp                        |  630 ++
 viennacl/backend/opencl.hpp                        |  146 +
 viennacl/backend/util.hpp                          |  280 +
 viennacl/circulant_matrix.hpp                      |  194 +-
 viennacl/compressed_compressed_matrix.hpp          |  588 ++
 viennacl/compressed_matrix.hpp                     | 1522 ++--
 viennacl/context.hpp                               |   88 +
 viennacl/coordinate_matrix.hpp                     |  812 +-
 viennacl/ell_matrix.hpp                            |  254 +
 viennacl/fft.hpp                                   |  402 +-
 viennacl/forwards.h                                |  812 +-
 viennacl/generator/autotune.hpp                    |  208 +
 viennacl/generator/forwards.h                      |  134 +
 viennacl/generator/generate.hpp                    |  408 +
 viennacl/generator/helpers.hpp                     |  286 +
 viennacl/generator/map_functor.hpp                 |  170 +
 viennacl/generator/mapped_objects.hpp              |  343 +
 viennacl/generator/matrix_product.hpp              |  716 ++
 viennacl/generator/profile_base.hpp                |  194 +
 viennacl/generator/profiles.hpp                    |  340 +
 viennacl/generator/saxpy.hpp                       |  210 +
 viennacl/generator/scalar_reduction.hpp            |  362 +
 viennacl/generator/set_arguments_functor.hpp       |  139 +
 .../generator/statement_representation_functor.hpp |  172 +
 viennacl/generator/utils.hpp                       |  274 +
 viennacl/generator/vector_reduction.hpp            |  243 +
 viennacl/hankel_matrix.hpp                         |  181 +-
 viennacl/hyb_matrix.hpp                            |  298 +-
 viennacl/io/kernel_parameters.hpp                  |  446 -
 viennacl/io/matrix_market.hpp                      |  113 +-
 viennacl/linalg/amg.hpp                            |  472 +-
 viennacl/linalg/bicgstab.hpp                       |  237 +-
 viennacl/linalg/bisect.hpp                         |  110 +-
 viennacl/linalg/cg.hpp                             |  123 +-
 viennacl/linalg/circulant_matrix_operations.hpp    |  175 +-
 viennacl/linalg/compressed_matrix_operations.hpp   |  265 -
 viennacl/linalg/cuda/common.hpp                    |  189 +
 viennacl/linalg/cuda/direct_solve.hpp              |  523 ++
 viennacl/linalg/cuda/matrix_operations.hpp         | 2539 ++++++
 viennacl/linalg/cuda/matrix_operations_col.hpp     | 1423 +++
 viennacl/linalg/cuda/matrix_operations_prod.hpp    | 2886 ++++++
 viennacl/linalg/cuda/matrix_operations_row.hpp     | 1419 +++
 viennacl/linalg/cuda/misc_operations.hpp           |   93 +
 viennacl/linalg/cuda/scalar_operations.hpp         |  380 +
 viennacl/linalg/cuda/sparse_matrix_operations.hpp  | 1831 ++++
 .../linalg/cuda/sparse_matrix_operations_solve.hpp |  761 ++
 viennacl/linalg/cuda/vector_operations.hpp         | 2790 ++++++
 viennacl/linalg/detail/amg/amg_base.hpp            |  712 +-
 viennacl/linalg/detail/amg/amg_coarse.hpp          |  921 +-
 viennacl/linalg/detail/amg/amg_debug.hpp           |   31 +-
 viennacl/linalg/detail/amg/amg_interpol.hpp        |  555 +-
 viennacl/linalg/detail/ilu/block_ilu.hpp           |  412 +
 viennacl/linalg/detail/ilu/common.hpp              |  236 +
 viennacl/linalg/detail/ilu/ilu0.hpp                |  346 +
 viennacl/linalg/detail/ilu/ilut.hpp                |  439 +
 viennacl/linalg/detail/op_applier.hpp              |  103 +
 viennacl/linalg/detail/op_executor.hpp             |   85 +
 viennacl/linalg/detail/spai/block_matrix.hpp       |   49 +-
 viennacl/linalg/detail/spai/block_vector.hpp       |   38 +-
 viennacl/linalg/detail/spai/fspai.hpp              |  219 +-
 viennacl/linalg/detail/spai/qr.hpp                 |  304 +-
 viennacl/linalg/detail/spai/small_matrix.hpp       |   37 +-
 viennacl/linalg/detail/spai/spai-dynamic.hpp       |  893 +-
 viennacl/linalg/detail/spai/spai-static.hpp        |  232 +-
 viennacl/linalg/detail/spai/spai.hpp               |  599 +-
 viennacl/linalg/detail/spai/spai_tag.hpp           |  105 +-
 viennacl/linalg/detail/spai/sparse_vector.hpp      |   60 +-
 viennacl/linalg/direct_solve.hpp                   |  779 +-
 viennacl/linalg/eig.hpp                            |   21 +-
 viennacl/linalg/gmres.hpp                          |  341 +-
 viennacl/linalg/hankel_matrix_operations.hpp       |  176 +-
 viennacl/linalg/host_based/common.hpp              |  166 +
 viennacl/linalg/host_based/direct_solve.hpp        |  418 +
 viennacl/linalg/host_based/matrix_operations.hpp   | 1177 +++
 viennacl/linalg/host_based/misc_operations.hpp     |   80 +
 viennacl/linalg/host_based/scalar_operations.hpp   |  162 +
 .../linalg/host_based/sparse_matrix_operations.hpp | 1603 ++++
 viennacl/linalg/host_based/sse_blas.hpp            | 1013 +++
 viennacl/linalg/host_based/sse_kernels.hpp         |  590 ++
 viennacl/linalg/host_based/vector_operations.hpp   |  621 ++
 viennacl/linalg/ichol.hpp                          |  228 +
 viennacl/linalg/ilu.hpp                            |    9 +-
 viennacl/linalg/inner_prod.hpp                     |  197 +-
 viennacl/linalg/jacobi_precond.hpp                 |  133 +-
 viennacl/linalg/lanczos.hpp                        |  295 +-
 viennacl/linalg/lu.hpp                             |  227 +
 viennacl/linalg/matrix_operations.hpp              | 1096 ++-
 viennacl/linalg/misc_operations.hpp                |   94 +
 viennacl/linalg/mixed_precision_cg.hpp             |  254 +
 viennacl/linalg/nmf.hpp                            |  187 +-
 viennacl/linalg/norm_1.hpp                         |   76 +-
 viennacl/linalg/norm_2.hpp                         |  159 +-
 viennacl/linalg/norm_frobenius.hpp                 |   73 +
 viennacl/linalg/norm_inf.hpp                       |   81 +-
 viennacl/linalg/opencl/common.hpp                  |   95 +
 viennacl/linalg/opencl/direct_solve.hpp            |  232 +
 .../kernels/compressed_compressed_matrix.hpp       |   89 +
 .../linalg/opencl/kernels/compressed_matrix.hpp    | 1096 +++
 .../linalg/opencl/kernels/coordinate_matrix.hpp    |  382 +
 viennacl/linalg/opencl/kernels/ell_matrix.hpp      |  195 +
 viennacl/linalg/opencl/kernels/fft.hpp             |  294 +
 viennacl/linalg/opencl/kernels/hyb_matrix.hpp      |  214 +
 viennacl/linalg/opencl/kernels/ilu.hpp             |   90 +
 viennacl/linalg/opencl/kernels/matrix.hpp          |  932 ++
 viennacl/linalg/opencl/kernels/matrix_element.hpp  |  138 +
 viennacl/linalg/opencl/kernels/matrix_prod.hpp     |  485 +
 viennacl/linalg/opencl/kernels/matrix_solve.hpp    |  212 +
 viennacl/linalg/opencl/kernels/nmf.hpp             |   82 +
 viennacl/linalg/opencl/kernels/scalar.hpp          |  266 +
 viennacl/linalg/opencl/kernels/spai.hpp            |  614 ++
 viennacl/linalg/opencl/kernels/svd.hpp             |  560 ++
 viennacl/linalg/opencl/kernels/vector.hpp          |  688 ++
 viennacl/linalg/opencl/kernels/vector_element.hpp  |  155 +
 viennacl/linalg/opencl/matrix_operations.hpp       |  998 ++
 viennacl/linalg/opencl/misc_operations.hpp         |   72 +
 viennacl/linalg/opencl/scalar_operations.hpp       |  201 +
 .../linalg/opencl/sparse_matrix_operations.hpp     |  940 ++
 .../opencl/vandermonde_matrix_operations.hpp       |   68 +
 viennacl/linalg/opencl/vector_operations.hpp       |  975 ++
 viennacl/linalg/power_iter.hpp                     |   81 +-
 viennacl/linalg/prod.hpp                           |  286 +-
 viennacl/linalg/qr-method-common.hpp               |  225 +
 viennacl/linalg/qr-method.hpp                      |  952 ++
 viennacl/linalg/qr.hpp                             |  594 +-
 viennacl/linalg/row_scaling.hpp                    |  209 +-
 viennacl/linalg/scalar_operations.hpp              |  242 +
 viennacl/linalg/spai.hpp                           |  151 +-
 viennacl/linalg/sparse_matrix_operations.hpp       |  375 +
 viennacl/linalg/svd.hpp                            |  404 +
 viennacl/linalg/toeplitz_matrix_operations.hpp     |  190 +-
 viennacl/linalg/tred2.hpp                          |   68 +
 viennacl/linalg/vandermonde_matrix_operations.hpp  |  185 +-
 viennacl/linalg/vector_operations.hpp              | 1169 ++-
 viennacl/matrix.hpp                                | 3051 +++++++
 viennacl/matrix_proxy.hpp                          |  489 +-
 viennacl/meta/enable_if.hpp                        |   90 +-
 viennacl/meta/predicate.hpp                        |  514 ++
 viennacl/meta/result_of.hpp                        |  634 ++
 viennacl/meta/tag_of.hpp                           |  105 +-
 viennacl/misc/bandwidth_reduction.hpp              |   13 +-
 viennacl/misc/cuthill_mckee.hpp                    |  772 +-
 viennacl/misc/gibbs_poole_stockmeyer.hpp           |  187 +-
 viennacl/ocl/backend.hpp                           |  122 +-
 viennacl/ocl/command_queue.hpp                     |  182 +-
 viennacl/ocl/context.hpp                           |  460 +-
 viennacl/ocl/device.hpp                            | 1722 +++-
 viennacl/ocl/device_utils.hpp                      |  155 +
 viennacl/ocl/enqueue.hpp                           |  132 +
 viennacl/ocl/error.hpp                             | 1260 +--
 viennacl/ocl/forwards.h                            |   17 +-
 viennacl/ocl/handle.hpp                            |  422 +-
 viennacl/ocl/infos.hpp                             |  268 +
 viennacl/ocl/kernel.hpp                            |  839 ++
 viennacl/ocl/local_mem.hpp                         |   21 +-
 viennacl/ocl/platform.hpp                          |   75 +-
 viennacl/ocl/program.hpp                           |   66 +-
 viennacl/ocl/utils.hpp                             |   48 +-
 viennacl/rand/gaussian.hpp                         |   54 +
 viennacl/rand/uniform.hpp                          |   56 +
 viennacl/rand/utils.hpp                            |   71 +
 viennacl/range.hpp                                 |   27 +-
 viennacl/scalar.hpp                                | 1269 +--
 viennacl/scheduler/execute.hpp                     |  247 +
 viennacl/scheduler/execute_axbx.hpp                |  379 +
 viennacl/scheduler/execute_elementwise.hpp         |  466 +
 viennacl/scheduler/execute_generic_dispatcher.hpp  |  135 +
 viennacl/scheduler/execute_matrix_dispatcher.hpp   |  210 +
 viennacl/scheduler/execute_matrix_prod.hpp         |  498 +
 viennacl/scheduler/execute_scalar_assign.hpp       |  189 +
 viennacl/scheduler/execute_scalar_dispatcher.hpp   |  131 +
 viennacl/scheduler/execute_util.hpp                |  253 +
 viennacl/scheduler/execute_vector_dispatcher.hpp   |  191 +
 viennacl/scheduler/forwards.h                      |  710 ++
 viennacl/scheduler/io.hpp                          |  290 +
 viennacl/slice.hpp                                 |   46 +-
 viennacl/toeplitz_matrix.hpp                       |  191 +-
 viennacl/tools/adapter.hpp                         |  258 +-
 viennacl/tools/entry_proxy.hpp                     |  144 +-
 viennacl/tools/matrix_size_deducer.hpp             |  199 +-
 .../tools/matrix_solve_kernel_class_deducer.hpp    |   77 -
 viennacl/tools/shared_ptr.hpp                      |  163 +
 viennacl/tools/timer.hpp                           |  122 +
 viennacl/tools/tools.hpp                           |  292 +
 viennacl/traits/clear.hpp                          |  147 +-
 viennacl/traits/context.hpp                        |   66 +
 viennacl/traits/fill.hpp                           |  139 +-
 viennacl/traits/handle.hpp                         |  248 +
 viennacl/traits/size.hpp                           |  323 +
 viennacl/traits/start.hpp                          |  104 +
 viennacl/traits/stride.hpp                         |   78 +
 viennacl/vandermonde_matrix.hpp                    |  182 +-
 viennacl/vector.hpp                                | 3243 +++++++
 viennacl/vector_proxy.hpp                          |  192 +-
 546 files changed, 131762 insertions(+), 35933 deletions(-)

diff --cc CMakeLists.txt
index 6011ec2,bd38b04..af714cd
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -30,8 -30,8 +30,13 @@@ ENDIF(${CMAKE_SYSTEM_NAME} MATCHES "Dar
  ################
  
  set(VERSION_MAJOR 1)
++<<<<<<< HEAD
 +set(VERSION_MINOR 3)
 +set(VERSION_PATCH 0)
++=======
+ set(VERSION_MINOR 5)
+ set(VERSION_PATCH 1)
++>>>>>>> upstream/1.5.1
  set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH})
  
  list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
diff --cc README
index e23d6ce,b4ea993..1897af3
--- a/README
+++ b/README
@@@ -26,9 -26,12 +26,18 @@@ ViennaCL requires the following
  The first step is to extract the file:
  
  Unix-based OS:
++<<<<<<< HEAD
 +$> gunzip ViennaCL-1.3.0.tar.gz
 +$> tar -xf ViennaCL-1.3.0.tar
 +$> cd ViennaCL-1.3.0
++=======
+ $> gunzip ViennaCL-1.5.1.tar.gz
+ $> tar -xf ViennaCL-1.5.1.tar
+ $> cd ViennaCL-1.5.1
+ 
+ Windows:
+ Extract the file using your favorite compressor/decompressor, e.g. 7-zip.
++>>>>>>> upstream/1.5.1
  
  ViennaCL is a header-only library; it is therefore sufficient to copy the subfolder viennacl/ (holding the header files) into your project directory or your system include directory. For instructions on how to set the include paths correctly, please refer to the documentation of your compiler.
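
Once the headers are in place, a quick sanity check can look as follows; this is a minimal sketch, assuming the viennacl/ subfolder is on the include path (with no backend macro defined, ViennaCL 1.4 and later falls back to host-based execution):

  #include <iostream>
  #include <vector>

  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/inner_prod.hpp"

  int main()
  {
    std::vector<float> host_x(100, 1.0f), host_y(100, 2.0f);

    viennacl::vector<float> x(100), y(100);
    viennacl::copy(host_x, x);  // transfer host data into ViennaCL vectors
    viennacl::copy(host_y, y);

    // inner product; assigning to float fetches the result back to the CPU
    float dot = viennacl::linalg::inner_prod(x, y);
    std::cout << "inner product: " << dot << std::endl;
    return 0;
  }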
  
diff --cc changelog
index fc0d0c1,c3fdc04..bf2bae1
--- a/changelog
+++ b/changelog
@@@ -2,8 -2,141 +2,146 @@@
  **** ViennaCL Change Logs ****
  ******************************
  
++<<<<<<< HEAD
 +*** Version 1.3.x ***
 +
++=======
+ *** Version 1.5.x ***
+ 
+ -- Version 1.5.1 --
+ This maintenance release fixes a few nasty bugs:
+  - Fixed a memory leak in the OpenCL kernel generator. Thanks to GitHub user dxyzab for spotting this.
+  - Added compatibility of the mixed precision CG implementation with older AMD GPUs. Thanks to Andreas Rost for the input.
+  - Fixed an error when running the QR factorization for matrices with fewer rows than columns. Thanks to Karol Polko for reporting.
+  - Re-added the accidentally removed manual chapters on additional algorithms and structured matrices. Thanks to Sajjadul Islam for the hint.
+  - Fixed buggy OpenCL kernels for matrix additions and subtractions for column-major matrices. Thanks to Tom Nicholson for reporting.
+  - Fixed an invalid default kernel parameter set for matrix-matrix multiplications on CPUs when using the OpenCL backend. Thanks again to Tom Nicholson.
+  - Corrected a weak check used in two tests. Thanks to Walter Mascarenhas for providing a fix.
+  - Fixed a wrong global work size inside the SPAI preconditioner. Thanks to Andreas Rost.
+ 
+ 
+ -- Version 1.5.0 --
+ This new minor release focuses on a more powerful API and on first steps toward making ViennaCL more accessible from languages other than C++.
+ In addition to many internal improvements both in terms of performance and flexibility, the following changes are visible to users:
+  - API-change: User-provided OpenCL programs extract their kernels automatically. A call to add_kernel() is now obsolete, hence the function was removed.
+  - API-change: The device class has been extended and now provides all information defined in the OpenCL 1.1 standard through member functions. The duplicate compute_units() and max_work_group_size() have been removed (thanks to Shantanu Agarwal for the input).
+  - API-change: viennacl::copy() from a ViennaCL object to an object of non-ViennaCL type no longer tries to resize the target object; an assertion is thrown if the sizes are incorrect, in order to provide consistent behavior across many different types.
+  - Data structure change: Vectors and matrices are now padded with zeros by default, resulting in higher performance, particularly for matrix operations. This padding needs to be taken into account when using fast_copy(), particularly for matrices.
+  - Fixed problems with CUDA and CMake+CUDA on Visual Studio.
+  - coordinate_matrix<> now also behaves correctly for tiny matrix dimensions.
+  - CMake 2.6 is the new minimum requirement instead of CMake 2.8.
+  - Vectors and matrices can be instantiated with integer template types (long, int, short, char).
+  - Added support for element_prod() and element_div() for dense matrices.
+  - Added element_pow() for vectors and matrices.
+  - Added norm_frobenius() for computing the Frobenius norm of dense matrices.
+  - Added unary element-wise operations for vectors and dense matrices: element_sin(), element_sqrt(), etc.
+  - Multiple OpenCL contexts can now be used in a multi-threaded setting (one thread per context).
+  - Multiple inner products with a common vector can now be computed efficiently, e.g. via inner_prod(x, tie(y, z)).
+  - Added support for prod(A, B), where A is a sparse matrix type and B is a dense matrix (thanks to Albert Zaharovits for providing parts of the implementation).
+  - Added diag() function for extracting the diagonal of a matrix to a vector, or for generating a square matrix from a vector with the vector elements on a diagonal (similar to MATLAB).
+  - Added row() and column() functions for extracting a certain row or column of a matrix to a vector.
+  - Sparse matrix-vector products now also work with vector strides and ranges.
+  - Added async_copy() for vectors to allow for a better overlap of computation and communication.
+  - Added compressed_compressed_matrix type for the efficient representation of CSR matrices with only a few nonzero rows.
+  - Added possibility to switch command queues in OpenCL contexts.
+  - Improved performance of Block-ILU by removing one spurious conversion step.
+  - Improved performance of Cuthill-McKee algorithm by about 40 percent.
+  - Improved performance of power iteration by avoiding the creation of temporaries in each step.
+  - Removed spurious status message to cout in matrix market reader and nonnegative matrix factorization.
+  - The OpenCL kernel launch logic no longer attempts to re-launch the kernel with smaller work sizes if an error is encountered (thanks to Peter Burka for pointing this out).
+  - Reduced overhead for lengthy expressions involving temporaries (at the cost of increased compilation times).
+  - vector and matrix are now padded to dimensions that are multiples of 128 by default. This greatly improves GEMM performance for arbitrary sizes.
+  - Loop indices for OpenMP parallelization are now all signed, increasing compatibility with older OpenMP implementations (thanks to Mrinal Deo for the hint).
+  - Complete rewrite of the generator. Now uses the scheduler for specifying the operation. Includes a full device database for portable high performance of GEMM kernels.
+  - Added micro-scheduler for attaching the OpenCL kernel generator to the user API.
+  - Certain BLAS functionality in ViennaCL is now also available through a shared library (libviennacl).
+  - Removed the external kernel parameter tuning facility, which is to be replaced by an internal device database through the kernel generator.
+  - Completely eliminated the OpenCL kernel conversion step in the developer repository and the source-release. One can now use the developer version without the need for a Boost installation.
+ 
+ 
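
To make a few of the 1.5.0 items above concrete, here is a short sketch; it assumes ViennaCL 1.5 headers, and the exact namespaces used for element_prod() and tie() are assumptions based on the notes above:

  #include "viennacl/vector.hpp"
  #include "viennacl/matrix.hpp"
  #include "viennacl/linalg/inner_prod.hpp"

  int main()
  {
    // element-wise product of dense matrices (new in 1.5.0)
    viennacl::matrix<float> A(64, 64), B(64, 64), C(64, 64);
    C = viennacl::linalg::element_prod(A, B);

    // two inner products sharing x, computed in a single pass (new in 1.5.0)
    viennacl::vector<float> x(64), y(64), z(64), dots(2);
    dots = viennacl::linalg::inner_prod(x, viennacl::tie(y, z));
    return 0;
  }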
+ *** Version 1.4.x ***
+ 
+ -- Version 1.4.2 --
+ This is a maintenance release, particularly resolving compilation problems with Visual Studio 2012.
+ - Largely refactored the internal code base, unifying code for vector, vector_range, and vector_slice.
+   Similar code refactoring was applied to matrix, matrix_range, and matrix_slice.
+   This not only resolves the problems in VS 2012, but also leads to shorter compilation times and a smaller code base.
+ - Improved performance of matrix-vector products of compressed_matrix on CPUs using OpenCL.
+ - Resolved a bug which shows up if certain rows and columns of a compressed_matrix are empty and the matrix is copied back to host.
+ - Fixed a bug and improved performance of GMRES. Thanks to Ivan Komarov for reporting via sourceforge.
+ - Added additional Doxygen documentation.
+ 
+ -- Version 1.4.1 --
+ This release focuses on improved stability and performance on AMD devices rather than introducing new features:
+ - Included fast matrix-matrix multiplication kernel for AMD's Tahiti GPUs if matrix dimensions are a multiple of 128.
+   Our sample HD7970 reaches over 1.3 TFLOPs in single precision and 200 GFLOPs in double precision (counting multiplications and additions as separate operations).
+ - All benchmark FLOPs are now using the common convention of counting multiplications and additions separately (ignoring fused multiply-add).
+ - Fixed a bug for matrix-matrix multiplication with matrix_slice<> when slice dimensions are multiples of 64.
+ - Improved detection logic for Intel OpenCL SDK.
+ - Fixed issues when resizing an empty compressed_matrix.
+ - Fixes and improved support for BLAS-1-type operations on dense matrices and vectors.
+ - Vector expressions can now be passed directly to inner_prod(), norm_1(), norm_2(), and norm_inf().
+ - Improved performance when using OpenMP.
+ - Better support for Intel Xeon Phi (MIC).
+ - Resolved problems when using OpenCL for CPUs if the number of cores is not a power of 2.
+ - Fixed a flaw when using AMG in debug mode. Thanks to Jakub Pola for reporting.
+ - Removed accidental external linkage (invalidating header-only model) of SPAI-related functions. Thanks again to Jakub Pola.
+ - Fixed issues with copy back to host when OpenCL handles are passed to CTORs of vector, matrix, or compressed_matrix. Thanks again to Jakub Pola.
+ - Added fix for segfaults on program exit when providing custom OpenCL queues. Thanks to Denis Demidov for reporting.
+ - Fixed bug in copy() to hyb_matrix as reported by Denis Demidov (thanks!).
+ - Added an overload for result_of::alignment for vector_expression. Thanks again to Denis Demidov.
+ - Added SSE-enabled code contributed by Alex Christensen.
+ 
+ -- Version 1.4.0 --
+ The transition from 1.3.x to 1.4.x features the largest number of additions, improvements, and cleanups since the initial release.
+ In particular, host-, OpenCL-, and CUDA-based execution is now supported. OpenCL now needs to be enabled explicitly!
+ New features and feature improvements are as follows:
+ - Added host-based and CUDA-enabled operations on ViennaCL objects. The default is now a host-based execution for reasons of compatibility.
+   Enable OpenCL- or CUDA-based execution by defining the preprocessor constants VIENNACL_WITH_OPENCL and VIENNACL_WITH_CUDA, respectively.
+   Note that CUDA-based execution requires the use of nvcc.
+ - Added mixed-precision CG solver (OpenCL-based).
+ - Greatly improved performance of ILU0 and ILUT preconditioners (up to 10-fold). Also fixed a bug in ILUT.
+ - Added initializer types from Boost.uBLAS (unit_vector, zero_vector, scalar_vector, identity_matrix, zero_matrix, scalar_matrix).
+   Thanks to Karsten Ahnert for suggesting the feature.
+ - Added incomplete Cholesky factorization preconditioner.
+ - Added element-wise operations for vectors as available in Boost.uBLAS (element_prod, element_div).
+ - Added restart-after-N-cycles option to BiCGStab.
+ - Added level-scheduling for ILU-preconditioners. Performance strongly depends on matrix pattern.
+ - Added least-squares example including a function inplace_qr_apply_trans_Q() to compute the right hand side vector Q^T b without rebuilding Q.
+ - Improved performance of LU-factorization of dense matrices.
+ - Improved dense matrix-vector multiplication performance (thanks to Philippe Tillet).
+ - Reduced overhead when copying to/from ublas::compressed_matrix.
+ - ViennaCL objects (scalar, vector, etc.) can now be used as global variables (thanks to an anonymous user on the support mailing list).
+ - Refurbished OpenCL vector kernels backend.
+   All operations of the type v1 = a v2 @ b v3 with vectors v1, v2, v3 and scalars a and b, including += and -= instead of =, are now temporary-free. Similarly for matrices.
+ - matrix_range and matrix_slice as well as vector_range and vector_slice can now be used and mixed completely seamlessly with all standard operations except lu_factorize().
+ - Fixed a bug when using copy() with iterators on vector proxy objects.
+ - Final reduction step in inner_prod() and norms is now computed on CPU if the result is a CPU scalar.
+ - Reduced kernel launch overhead of simple vector kernels by packing multiple kernel arguments together.
+ - Updated SVD code and added routines for the computation of symmetric eigenvalues using OpenCL.
+ - custom_operation's constructor now supports multiple arguments, allowing multiple expressions to be packed into the same kernel for improved performance.
+   However, all the data structures in the packed operations must have the same size.
+ - Further improvements to the OpenCL kernel generator: Added a repeat feature for generating loops inside a kernel, added element-wise products and division, added support for every one-argument OpenCL function.
+ - The name of the operation is now a mandatory argument of the constructor of custom_operation.
+ - Improved performance of the generated matrix-vector product code.
+ - Updated interfacing code for the Eigen library, now working with Eigen 3.x.y.
+ - Converter in source-release now depends on Boost.filesystem3 instead of Boost.filesystem2, thus requiring Boost 1.44 or above.
+ 
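
The backend switch described in the 1.4.0 notes above is a compile-time choice. A hedged sketch, using only the preprocessor constants named there:

  // Define the desired backend before including any ViennaCL header,
  // or pass it on the compiler command line (e.g. -DVIENNACL_WITH_OPENCL).
  #define VIENNACL_WITH_OPENCL    // enable the OpenCL backend
  //#define VIENNACL_WITH_CUDA    // CUDA backend; requires compiling with nvcc

  #include "viennacl/vector.hpp"

  int main()
  {
    // resides in OpenCL memory when VIENNACL_WITH_OPENCL is defined,
    // in plain host memory otherwise (the default since 1.4.0)
    viennacl::vector<double> v(1000);
    return 0;
  }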
+ *** Version 1.3.x ***
+ 
+ -- Version 1.3.1 --
+ The following bugfixes and enhancements have been applied:
+ - Fixed a compilation problem with GCC 4.7 caused by the wrong order of function declarations. Also removed unnecessary indirections and unused variables.
+ - Improved out-of-source build in the src-version (for packagers).
+ - Added virtual destructor in the runtime_wrapper-class in the kernel generator.
+ - Extended flexibility of submatrix and subvector proxies (ranges, slices).
+ - Block-ILU for compressed_matrix is now applied on the GPU during the solver cycle phase. However, for the moment the implementation file viennacl/linalg/detail/ilu/opencl_block_ilu.hpp needs to be included separately in order to avoid an OpenCL dependency for all ILU implementations.
+ - SVD now supports double precision.
+ - Slightly adjusted the interface for NMF. The approximation rank is now specified by the supplied matrices W and H.
+ - Fixed a problem with matrix-matrix products if the result matrix is not initialized properly (thanks to Laszlo Marak for finding the issue and a fix).
+ - The operations C += prod(A, B) and C -= prod(A, B) for matrices A, B, and C no longer introduce temporaries if the three matrices are distinct.
+ 
++>>>>>>> upstream/1.5.1
  -- Version 1.3.0 --
  Several new features enter this new minor version release.
  Some of the experimental features introduced in 1.2.0 keep their experimental state in 1.3.x due to the short time since 1.2.0, with exceptions listed below along with the new features:
diff --cc doc/Doxyfile.in
index bdca248,2edec42..e68a17f
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@@ -27,15 -27,15 +27,19 @@@ DOXYFILE_ENCODING      = UTF-
  
  PROJECT_NAME           = "ViennaCL - The Vienna Computing Library"
  
- # The PROJECT_NUMBER tag can be used to enter a project or revision number. 
- # This could be handy for archiving the generated documentation or 
+ # The PROJECT_NUMBER tag can be used to enter a project or revision number.
+ # This could be handy for archiving the generated documentation or
  # if some version control system is used.
  
++<<<<<<< HEAD
 +PROJECT_NUMBER         = 1.3.0
++=======
+ PROJECT_NUMBER         = 1.5.1
++>>>>>>> upstream/1.5.1
  
- # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
- # base path where the generated documentation will be put. 
- # If a relative path is entered, it will be relative to the location 
+ # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+ # base path where the generated documentation will be put.
+ # If a relative path is entered, it will be relative to the location
  # where doxygen was started. If left blank the current directory will be used.
  
  OUTPUT_DIRECTORY       = doxygen/
diff --cc doc/manual/algorithms.tex
index 70f00d1,4dba1a5..1d2100b
--- a/doc/manual/algorithms.tex
+++ b/doc/manual/algorithms.tex
@@@ -181,31 -202,31 +202,59 @@@ second parameter specifies the drop tol
  \subsection{Incomplete LU Factorization with Static Pattern (ILU0)}
  Similar to ILUT, ILU0 computes an approximate LU factorization with sparse factors L and U.
  While ILUT determines the location of nonzero entries on the fly, ILU0 uses the sparsity pattern of A for the sparsity pattern of L and U \cite{saad-iterative-solution}.
+ Due to the serial nature of the preconditioner, the setup of ILU0 is computed on the CPU.
+ \begin{lstlisting}
+ //compute ILU0 preconditioner:
+ viennacl::linalg::ilu0_tag ilu0_config;
+ viennacl::linalg::ilu0_precond< SparseMatrix > vcl_ilu0(vcl_matrix,
+                                                         ilu0_config);
+ 
+ //solve (e.g. using conjugate gradient solver)
+ vcl_result = viennacl::linalg::solve(vcl_matrix,
+                                      vcl_rhs,
+                                      viennacl::linalg::bicgstab_tag(),
+                                      vcl_ilu0);   //preconditioner here
+ \end{lstlisting}
+ The triangular substitutions may be applied in parallel on GPUs by enabling \emph{level-scheduling} \cite{saad-iterative-solution} via the member function call \lstinline|use_level_scheduling(true)| in the \lstinline|ilu0_config| object.
+ 
+ A single boolean parameter can be passed to the constructor of \lstinline|ilu0_tag|, specifying whether level scheduling should be used.
+ 
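+ For instance, enabling level scheduling on the configuration object from the listing above is a one-liner (a minimal sketch; the remaining solver code stays unchanged):
+ \begin{lstlisting}
+ viennacl::linalg::ilu0_tag ilu0_config;
+ ilu0_config.use_level_scheduling(true); //parallel triangular solves
+ \end{lstlisting}
+ 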
++<<<<<<< HEAD
++\subsection{Incomplete LU Factorization with Static Pattern (ILU0)}
++Similar to ILUT, ILU0 computes an approximate LU factorization with sparse factors L and U.
++While ILUT determines the location of nonzero entries on the fly, ILU0 uses the sparsity pattern of A for the sparsity pattern of L and U \cite{saad-iterative-solution}.
 +Due to the serial nature of the preconditioner, the setup as well as each application of ILU0 to the residual is computed on
 +the CPU.
 +
 +\begin{lstlisting}
 +//compute ILU0 preconditioner:
 +ilu0_precond< SparseMatrix > vcl_ilu0(vcl_matrix,
 +                                      viennacl::linalg::ilu0_tag());
 +
 +//solve (e.g. using conjugate gradient solver)
 +vcl_result = viennacl::linalg::solve(vcl_matrix,
 +                                     vcl_rhs,
 +                                     viennacl::linalg::bicgstab_tag(),
 +                                     vcl_ilu0);   //preconditioner here
 +\end{lstlisting}
 +Two parameters can be passed to the constructor of \lstinline|ilu0_tag|:
 +The first parameter specifies the lower row and column index for which ILU0 should be computed, while the second parameter specifies the upper bound on the row and column indices considered.
 +If, for example, the parameter set $(2,5)$ is supplied, ILU0 is computed for the diagonal block $A(2:4, 2:4)$, where $2:4 = \{2, 3, 4\}$.
 +By default, ILU0 is computed for the full system matrix.
++=======
+ \TIP{The performance of level scheduling depends strongly on the matrix pattern and is thus disabled by default.}
++>>>>>>> upstream/1.5.1
  
  \subsection{Block-ILU}
  To overcome the serial nature of ILUT and ILU0 applied to the full system matrix,
  a parallel variant is to apply ILU to diagonal blocks of the system matrix.
  This is accomplished by the \lstinline|block_ilu| preconditioner, which takes
  the system matrix type as first template argument and the respective ILU-tag type as second template argument
++<<<<<<< HEAD
 +(either \lstinline|ilut_tag| or \lstinline|ilu0_tag|). 
++=======
+ (either \lstinline|ilut_tag| or \lstinline|ilu0_tag|). Support for accelerators using {\CUDA} or {\OpenCL} is provided.
++>>>>>>> upstream/1.5.1
  
  \begin{lstlisting}
  //compute block-ILU preconditioner using ILU0 for each block:
@@@ -219,9 -240,10 +268,16 @@@ vcl_result = viennacl::linalg::solve(vc
                                       viennacl::linalg::bicgstab_tag(),
                                       vcl_block_ilu0);
  \end{lstlisting}
++<<<<<<< HEAD
 +A third argument can be passed to the constructor of \lstinline|block_ilu_precond|: 
 +Either the number of blocks to be used (defaults to 4), or an index vector with fine-grained control over the blocks. Refer to the Doxygen pages in doc/doxygen for details.
 +
++=======
+ A third argument can be passed to the constructor of \lstinline|block_ilu_precond|:
+ Either the number of blocks to be used (defaults to $8$), or an index vector with fine-grained control over the blocks. Refer to the Doxygen pages in doc/doxygen for details.
+ 
+ \TIP{The number of blocks is a design parameter for your sparse linear system at hand. Higher number of blocks leads to better memory bandwidth utilization on GPUs, but may increase the number of solver iterations.}
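+ 
+ For instance, using $16$ blocks with ILU0 may look as follows (a sketch based on the constructor signature described above; $16$ is an arbitrary illustration value):
+ \begin{lstlisting}
+ viennacl::linalg::block_ilu_precond<SparseMatrix,
+                                     viennacl::linalg::ilu0_tag>
+   vcl_block_ilu0(vcl_matrix, viennacl::linalg::ilu0_tag(), 16);
+ \end{lstlisting}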
++>>>>>>> upstream/1.5.1
  
  \subsection{Jacobi Preconditioner}
  A Jacobi preconditioner is a simple diagonal preconditioner given by the reciprocals of the diagonal entries of the system matrix $A$.
@@@ -256,235 -278,63 +312,117 @@@ vcl_result = viennacl::linalg::solve(vc
  The tag \lstinline|viennacl::linalg::row_scaling_tag()| can be supplied with a parameter denoting the norm to be used. A value of \lstinline|1| specifies the
  $l^1$-norm, while a value of $2$ selects the $l^2$-norm (default).
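  
  For instance, selecting the $l^2$-norm explicitly may look as follows (a sketch, assuming the preconditioner class \lstinline|viennacl::linalg::row_scaling| is set up analogously to the other preconditioners in this section):
  \begin{lstlisting}
  //row scaling preconditioner using the l2-norm:
  viennacl::linalg::row_scaling< SparseMatrix >
     vcl_row_scaling(vcl_matrix, viennacl::linalg::row_scaling_tag(2));
  \end{lstlisting}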
  
- \subsection{Algebraic Multigrid}
- \NOTE{Algebraic Multigrid preconditioners are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
- be included in future releases!}
- 
- \NOTE{Algebraic Multigrid preconditioners depend on {\ublas}.}
- 
- Algebraic multigrid mimics the behavior of geometric multigrid on the algebraic level and is thus suited for black-box purposes, where only the system matrix
- and the right hand side vector are available \cite{trottenberg:multigrid}. Many different flavors of the individual multigrid ingredients exist
- \cite{yang:parallel-amg}, of which the most common ones are implemented in {\ViennaCL}.
  
- The two main ingredients of algebraic multigrid are a coarsening algorithm and an interpolation algorithm. The available coarsening methods are listed in
- Tab.~\ref{tab:amg-coarsening}.
- \begin{table}[tbp]
- \begin{center}
- \begin{tabular}{l|l}
- Description & {\ViennaCL} option constant \\
- \hline
- Classical Ruge-St\"uben (RS) & \lstinline|VIENNACL_AMG_COARSE_RS| \\
- One-Pass & \lstinline|VIENNACL_AMG_COARSE_ONEPASS| \\
- RS0 & \lstinline|VIENNACL_AMG_COARSE_RS0| \\
- RS3 & \lstinline|VIENNACL_AMG_COARSE_RS3| \\
- Aggregation & \lstinline|VIENNACL_AMG_COARSE_AG| \\
- Smoothed aggregation & \lstinline|VIENNACL_AMG_COARSE_SA| \\
- \end{tabular}
- \caption{AMG coarsening methods available in {\ViennaCL}. Per default, classical RS coarsening is used.\label{tab:amg-coarsening}}
- \end{center}
- \end{table}
- The available interpolation methods are given in Tab.~\ref{tab:amg-interpolation}.
- \begin{table}[tbp]
- \begin{center}
- \begin{tabular}{l|l}
- Description & {\ViennaCL} option constant \\
- \hline
- Direct & \lstinline|VIENNACL_AMG_INTERPOL_DIRECT| \\
- Classic & \lstinline|VIENNACL_AMG_INTERPOL_ONEPASS| \\
- RS0 coarsening & \lstinline|VIENNACL_AMG_INTERPOL_RS0| \\
- RS3 coarsening & \lstinline|VIENNACL_AMG_INTERPOL_RS3| \\
- \end{tabular}
- \caption{AMG interpolation methods available in {\ViennaCL}. Per default, direct interpolation is used.\label{tab:amg-interpolation}}
- \end{center}
- \end{table}
- In addition, the following parameters can be controlled in the \lstinline|amg_tag| and can be passed to the constructor:
+ \section{Eigenvalue Computations}
+ %{\ViennaCL}
+ Two algorithms for the computations of the eigenvalues of a matrix $A$ are implemented in {\ViennaCL}:
  \begin{itemize}
-  \item Strength of dependence threshold (default: $0.25$)
-  \item Interpolation weight (default: $1$)
-  \item Jacobi smoother weight (default: $1$)
-  \item Number of pre-smoothing steps (default: $1$)
-  \item Number of post-smoothing steps (default: $1$)
-  \item Number of coarse levels
+ \item The Power Iteration \cite{golub:matrix-computations}
+ \item The Lanczos Algorithm \cite{simon:lanczos-pro}
  \end{itemize}
+ Depending on the parameter \lstinline|tag|, one of them is called.
+ Both algorithms can be used for either {\ublas} or {\ViennaCL} compressed matrices.\\
+ In order to obtain the eigenvalue with the largest absolute value, the power iteration should be called. \\
+ The Lanczos algorithm returns a vector of the largest eigenvalues with the same type as the entries of the matrix.
  
- \TIP{Note that the efficiency of the various AMG flavors is typically highly problem-specific. Therefore, failure of one method for a particular problem does
- NOT imply that other coarsening or interpolation strategies will fail as well.}
- 
- \subsection{Sparse Approximate Inverses}
- 
- \NOTE{Sparse Approximate Inverse preconditioners are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
- be included in future releases!}
- 
- \NOTE{Sparse Approximate Inverse preconditioners depend on {\ublas}.}
- 
- An alternative construction of a preconditioner for a sparse system matrix $A$ is to compute a matrix $M$ with a prescribed sparsity pattern such that
- \begin{align}
-  \Vert AM - I \Vert_F \rightarrow \min \ ,
- \end{align}
- where $\Vert \cdot \Vert_F$ denotes the Frobenius norm.
- This is the basic idea of the sparse approximate inverse (SPAI) preconditioner. SPAI preconditioners are increasingly attractive because of their inherent high degree of
- parallelism, since the minimization problem can be solved independently for each column of $M$. {\ViennaCL} provides two preconditioners of
- this family: The first is the classical SPAI algorithm as described by Grote and Huckle \cite{grote:spai}, the second is the factored SPAI (FSPAI) for symmetric
- matrices as proposed by Huckle \cite{huckle:fspai}.
- 
- SPAI can be employed for a CPU matrix \lstinline|M| of type \lstinline|MatrixType| as follows:
- \begin{lstlisting}
- // setup SPAI preconditioner, purely CPU-based
- viennacl::linalg::spai_precond<MatrixType> 
-   spai_cpu(M, viennacl::linalg::spai_tag(1e-3, 3, 5e-2));
- 
- //solve (e.g. using stab. Bi-conjugate gradient solver)
- vcl_result = viennacl::linalg::solve(M,
-                                      rhs,
-                                      viennacl::linalg::bicgstab_tag(),
-                                      spai_cpu);
- \end{lstlisting}
- The first parameter denotes the residual norm threshold for the full matrix, the second parameter the maximum number of pattern updates, and the third
- parameter is the threshold for the residual of each minimization problem.
- 
- For GPU-matrices, only parts of the setup phase are computed on the CPU, because compute-intensive tasks can be carried out on the GPU:
- \begin{lstlisting}
- // setup SPAI preconditioner, GPU-assisted
- viennacl::linalg::spai_precond<GPUMatrixType> 
-   spai_gpu(vcl_matrix, viennacl::linalg::spai_tag(1e-3, 3, 5e-2));
- 
- //solve (e.g. using conjugate gradient solver)
- vcl_result = viennacl::linalg::solve(vcl_matrix,
-                                      vcl_rhs,
-                                      viennacl::linalg::bicgstab_tag(),
-                                      spai_gpu);
- \end{lstlisting}
- The \lstinline|GPUMatrixType| is typically a \lstinline|viennacl::compressed_matrix| type.
- 
- For symmetric matrices, FSPAI can be used with the conjugate gradient solver:
+ The algorithms are called for a matrix object \lstinline|A| by
  \begin{lstlisting}
- viennacl::linalg::fspai_precond<MatrixType> fspai_cpu(M, viennacl::linalg::fspai_tag());
- 
- //solve (e.g. using stab. Bi-conjugate gradient solver)
- vcl_result = viennacl::linalg::solve(M,
-                                      rhs,
-                                      viennacl::linalg::cg_tag(),
-                                      fspai_cpu);
+ std::vector<double> largest_eigenvalues = viennacl::linalg::eig(A, ltag); //Lanczos
+ double largest_eigenvalue = viennacl::linalg::eig(A, ptag); //power iteration
  \end{lstlisting}
- Our experience is that FSPAI is typically more efficient than SPAI when applied to the same matrix, both in computational effort and in terms of convergence
- acceleration of the iterative solvers. 
- 
- \NOTE{At present, there is no GPU-accelerated FSPAI included in {\ViennaCL}.}
- 
- Note that FSPAI depends on the ordering of the unknowns, thus bandwidth reduction algorithms may be employed first, cf.~Sec.~\ref{sec:bandwidth-reduction}.
  
  
- \section{Fast Fourier Transform}
- \NOTE{The fast Fourier transform is experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
- be included in future releases!}
- 
- Since there is no standardized complex type in {\OpenCL} at the time of the release of {\ViennaCLversion}, vectors need to be set up with real- and imaginary
- part before computing a fast Fourier transform (FFT). In order to store complex numbers $z_0$, $z_1$, etc.~in a \lstinline|viennacl::vector|, say \lstinline|v|,
- the real and imaginary parts are mapped to even and odd entries of \lstinline|v| respectively: \lstinline|v[0] = Real(z_0)|, \lstinline|v[1] = Imag(z_0)|,
- \lstinline|v[2] = Real(z_1)|, \lstinline|v[3] = Imag(z_1)|, etc.
- 
- The FFT of \lstinline|v| can then be computed either by writing to a second vector \lstinline|output| or by directly writing the result to \lstinline|v|
- \begin{lstlisting}
-  viennacl::fft(v, output);
-  viennacl::inplace_fft(v);
- \end{lstlisting}
- Conversely, the inverse FFT is computed as
+ \subsection{Power Iteration}
+ The power iteration computes the dominant eigenvalue of a matrix by repeatedly multiplying the matrix with a vector, where the resulting vector is used as input for the next multiplication. The computation stops as soon as the norm of the vector converges. \\
+ The final vector is the eigenvector corresponding to the eigenvalue with the largest absolute value.\\
+ To call this algorithm, \lstinline|piter_tag| must be used.
+ This tag has only one parameter: \\ \lstinline|terminationfactor| defines the accuracy of the computation, i.e.~if the norm of the eigenvector changes by less than this value, the computation stops and returns the corresponding eigenvalue (default: $10^{-10}$).\\
+ The call of the constructor may look like the following:
  \begin{lstlisting}
-  viennacl::ifft(v, output);
-  viennacl::inplace_ifft(v);
+ viennacl::linalg::piter_tag ptag(1e-8);
  \end{lstlisting}
  
- \NOTE{In {\ViennaCLversion} the FFT with complexity $N \log N$ is computed for vectors with a size of a power of two only. For other vector sizes, a standard
- discrete Fourier transform with complexity $N^2$ is employed. This is subject to change in future versions.}
- 
- \section{Bandwidth Reduction} \label{sec:bandwidth-reduction}
- \NOTE{Bandwidth reduction algorithms are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
- be included in future releases!}
+ \TIP{Example code can be found in \lstinline|examples/tutorial/power-iter.cpp|.}
  
- The bandwidth of a sparse matrix is defined as the maximum difference of the indices of nonzero entries in a row, taken over all rows. A low bandwidth
- typically allows for the use of efficient banded matrix solvers instead of iterative solvers. Moreover, better cache utilization as well as lower fill-in in
- LU-factorization based algorithms can be expected.
+ \subsection{The Lanczos Algorithm}
+ The Lanczos algorithm can be used to compute the largest eigenvalues of a sparse high-dimensional matrix.
+ The algorithm reduces the given high-dimensional matrix to a tridiagonal matrix of much lower dimension.
+ The eigenvalues of this tridiagonal matrix are equal to the largest eigenvalues of the original matrix. \\
+ The eigenvalues of the tridiagonal matrix are calculated by using the bisection method \cite{golub:matrix-computations}. \\
+ To call this Lanczos algorithm, \lstinline|lanczos_tag| must be used.
+ This tag has several parameters that can be passed to the constructor:
  
- For a given sparse matrix with large bandwidth, {\ViennaCL} provides routines for renumbering the unknowns such that the reordered system matrix shows much
- smaller bandwidth. Typical applications stem from the discretization of partial differential equations by means of the finite element or the finite difference
- method. The algorithms employed are as follows:
  \begin{itemize}
-  \item Classical Cuthill-McKee algorithm \cite{cuthill:reducing-bandwidth}
-  \item Modified Cuthill-McKee algorithm \cite{cuthill:reducing-bandwidth}
-  \item Gibbs-Poole-Stockmeyer algorithm, cf.~\cite{lewis:gps-algorithm}
+  \item The exponent of epsilon for the tolerance of the reorthogonalization, defined by the parameter \lstinline|factor| (default: $0.75$)
+  \item The method of the Lanczos algorithm: $0$ uses partial reorthogonalization, $1$ full reorthogonalization, and $2$ no reorthogonalization (default: $0$)
+  \item The number of eigenvalues that are returned is specified by \lstinline|num_eigenvalues| (default: $10$)
+  \item The size of the Krylov space used for the computations can be set by the parameter \lstinline|krylov_size| (default: $100$). The maximum number of iterations is bounded by this parameter
  \end{itemize}
- The modified Cuthill-McKee algorithm also takes nodes with small, but not necessarily minimal degree as root node into account and may lead to better results
- than the classical Cuthill-McKee algorithm. A parameter $a \in [0,1]$ controls the number of nodes considered: All nodes with degree $d$ fulfilling
- \begin{align*}
-  d_{\min} \leq d \leq d_{\min} + a(d_{\max} - d_{\min})
- \end{align*}
- are considered, where $d_{\min}$ and $d_{\max}$ are the minimum and maximum nodal degrees in the graph. A second parameter \lstinline|gmax| specifies the
- number of additional root nodes considered.
- 
- The algorithms are called for a \lstinline|matrix| of a type compatible with \lstinline|std::vector< std::map<int, double> >| by
+ The call of the constructor may look like the following:
  \begin{lstlisting}
-  r = viennacl::reorder(matrix, viennacl::cuthill_mckee_tag());
-  r = viennacl::reorder(matrix, 
-                        viennacl::advanced_cuthill_mckee_tag(a, gmax));
-  r = viennacl::reorder(matrix, viennacl::gibbs_poole_stockmeyer_tag());
+ viennacl::linalg::lanczos_tag ltag(0.85, 15, 0, 200);
  \end{lstlisting}
- and return the permutation array. In {\ViennaCLversion}, the user then needs to manually reorder the sparse matrix based on the permutation array. Example code
- can be found in \lstinline|examples/tutorial/bandwidth-reduction.cpp|.
  
+ \TIP{Example code can be found in \lstinline|examples/tutorial/lanczos.cpp|.}
  
 +\section{Eigenvalue Computations}
 +%{\ViennaCL} 
 +Two algorithms for the computations of the eigenvalues of a matrix $A$ are implemented in {\ViennaCL}:
 +\begin{itemize}
 +\item The Power Iteration \cite{golub:matrix-computations}
 +\item The Lanczos Algorithm \cite{simon:lanczos-pro}
 +\end{itemize}
 +Depending on the parameter \lstinline|tag|, one of them is called.
 +Both algorithms can be used for either {\ublas} or {\ViennaCL} compressed matrices.\\
 +In order to obtain the eigenvalue with the largest absolute value, the power iteration should be called. \\
 +The Lanczos algorithm returns a vector of the largest eigenvalues with the same type as the entries of the matrix.
 +
 +The algorithms are called for a matrix object \lstinline|A| by
 +\begin{lstlisting}
 +std::vector<double> largest_eigenvalues = viennacl::linalg::eig(A, ltag); //Lanczos
 +double largest_eigenvalue = viennacl::linalg::eig(A, ptag); //power iteration
 +\end{lstlisting}
 +
 +
 +\subsection{Power Iteration}
 +The power iteration computes the dominant eigenvalue of a matrix by repeatedly multiplying the matrix with a vector, where the resulting vector is used as input for the next multiplication. The computation stops as soon as the norm of the vector converges. \\
 +The final vector is the eigenvector corresponding to the eigenvalue with the largest absolute value.\\
 +To call this algorithm, \lstinline|piter_tag| must be used.
 +This tag has only one parameter: \\ \lstinline|terminationfactor| defines the accuracy of the computation, i.e.~if the norm of the eigenvector changes by less than this value, the computation stops and returns the corresponding eigenvalue (default: $10^{-10}$).\\
 +The call of the constructor may look like the following:
 +\begin{lstlisting} 
 +viennacl::linalg::piter_tag ptag(1e-8);
 +\end{lstlisting}
 +
 +\TIP{Example code can be found in \lstinline|examples/tutorial/power-iter.cpp|.}
 +
 +\subsection{The Lanczos Algorithm}
 +The Lanczos algorithm can be used to compute the largest eigenvalues of a sparse high-dimensional matrix.
 +The algorithm reduces the given high-dimensional matrix to a tridiagonal matrix of much lower dimension. The eigenvalues of this tridiagonal matrix are equal to the largest eigenvalues of the original matrix. \\
 +The eigenvalues of the tridiagonal matrix are calculated by using the bisection method \cite{golub:matrix-computations}. \\
 +To call this Lanczos algorithm, \lstinline|lanczos_tag| must be used.
 +This tag has several parameters that can be passed to the constructor:
 +
 +\begin{itemize}
 + \item The exponent of epsilon for the tolerance of the reorthogonalization, defined by the parameter \lstinline|factor| (default: $0.75$)
 + \item The method of the Lanczos algorithm: $0$ uses partial reorthogonalization, $1$ full reorthogonalization, and $2$ no reorthogonalization (default: $0$)
 + \item The number of eigenvalues that are returned is specified by \lstinline|num_eigenvalues| (default: $10$)
 + \item The size of the Krylov space used for the computations can be set by the parameter \lstinline|krylov_size| (default: $100$). The maximum number of iterations is bounded by this parameter
 +\end{itemize}
 +The call of the constructor may look like the following:
 +\begin{lstlisting}
 +viennacl::linalg::lanczos_tag ltag(0.85, 15, 0, 200);
 +\end{lstlisting}
 +
 +\TIP{Example code can be found in \lstinline|examples/tutorial/lanczos.cpp|.}
 +
  
  \section{QR Factorization}
++<<<<<<< HEAD
++=======
+ 
+ \NOTE{The current QR factorization implementation depends on {\ublas}.}
++>>>>>>> upstream/1.5.1
  
  A matrix $A \in \mathbb{R}^{n\times m}$ can be factored into $A = Q R$, where $Q \in \mathbb{R}^{n\times n}$ is an
  orthogonal matrix and $R \in \mathbb{R}^{n \times m}$ is upper triangular. This so-called QR-factorization is important for eigenvalue computations as well as
@@@ -497,53 -347,21 +435,68 @@@ worker function \lstinline|inplace_qr|
  \begin{lstlisting}
    std::vector<ScalarType> betas = viennacl::linalg::inplace_qr(A, 12);
  \end{lstlisting}
- If $A$ is a dense matrix from \ublas, the calculation is carried out on the CPU using a single thread. If $A$ is a 
+ If $A$ is a dense matrix from \ublas, the calculation is carried out on the CPU using a single thread. If $A$ is a
  \lstinline|viennacl::matrix|, a hybrid implementation is used: The panel factorization is carried out using \ublas, while expensive BLAS level 3 operations
++<<<<<<< HEAD
 +are computed on the OpenCL device using multiple threads. 
++=======
+ are computed on the OpenCL device using multiple threads.
++>>>>>>> upstream/1.5.1
  
  Typically, the orthogonal matrix $Q$ is kept in implicit form for reasons of computational efficiency.
  However, if $Q$ and $R$ have to be computed explicitly, the function \lstinline|recoverQ| can be used:
  \begin{lstlisting}
-   viennacl::linalg::recoverQ(A, betas, Q, R); 
+   viennacl::linalg::recoverQ(A, betas, Q, R);
  \end{lstlisting}
  Here, \lstinline|A| is the inplace QR-factored matrix, \lstinline|betas| are the coefficients of the Householder reflectors as returned by
++<<<<<<< HEAD
 +\lstinline|inplace_qr|, while \lstinline|Q| and \lstinline|R| are the destination matrices.
 +
 +
 +\section{Singular Value Decomposition}
 +\NOTE{Singular Value Decomposition is experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
 +be included in future releases!}
 +
 +\NOTE{Singular Value Decomposition in {\ViennaCLversion} is provided for a row-major matrix $A$ with single precision floating point entries (\lstinline|float|) only.}
 +
 +Any matrix $A$ can be factored as
 +\begin{align}
 + A = U \Sigma V^{\mathrm{T}}
 +\end{align}
 +with orthogonal matrices $U$ and $V$ and a diagonal matrix $\Sigma$ consisting of non-negative diagonal entries only.
 +
 +\begin{lstlisting}
 + viennacl::matrix<ScalarType> A(size1, size2);
 + viennacl::matrix<ScalarType> U(size1, size1);
 + viennacl::matrix<ScalarType> V(size2, size2);
 +
 + viennacl::linalg::svd(A, U, V);
 +\end{lstlisting}
 +The input matrix \lstinline|A| is overwritten with $\Sigma$.
 +
 +\section{Nonnegative Matrix Factorization}
 +\NOTE{Nonnegative Matrix Factorization is experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
 +be included in future releases!}
 +
 +In various fields such as text mining, a matrix $V$ needs to be factored into factors $W$ and $H$ such that the function
 +\begin{align*}
 + f(W, H) = \Vert V - WH \Vert_{\mathrm{F}}^2
 +\end{align*}
 +is minimized. The algorithm proposed by Lee and Seung \cite{lee:nmf} is available in ViennaCL as
 +\begin{lstlisting}
 + viennacl::matrix<ScalarType> V(size1, size2);
 + viennacl::matrix<ScalarType> W(size1, size1);
 + viennacl::matrix<ScalarType> H(size2, size2);
 +
 + viennacl::linalg::nmf(V, W, H);
 +\end{lstlisting}
++=======
+ \lstinline|inplace_qr|, while \lstinline|Q| and \lstinline|R| are the destination matrices. However, the explicit formation of $Q$ is expensive and is usually avoided.
+ For a number of applications of the QR factorization it is required to apply $Q^T$ to a vector $b$. This is accomplished by
+ \begin{lstlisting}
+  viennacl::linalg::inplace_qr_apply_trans_Q(A, betas, b);
+ \end{lstlisting}
+ without setting up $Q$ (or $Q^T$) explicitly.
+ 
+ \TIP{Have a look at \lstinline|examples/tutorial/least-squares.cpp| for a least-squares computation using QR factorizations.}
++>>>>>>> upstream/1.5.1
diff --cc doc/manual/changelogs.tex
index 8137e0f,1daaafe..31ba2b8
--- a/doc/manual/changelogs.tex
+++ b/doc/manual/changelogs.tex
@@@ -1,16 -1,169 +1,190 @@@
  
- \chapter*{Change Logs} \addcontentsline{toc}{chapter}{Change Logs}
+ \chapter{Change Logs} %\addcontentsline{toc}{chapter}{Change Logs}
+ 
+ \section*{Version 1.5.x}
+ 
+ \subsection*{Version 1.5.1}
+ This maintenance release fixes a few nasty bugs:
+ \begin{itemize}
+  \item Fixed a memory leak in the OpenCL kernel generator. Thanks to GitHub user dxyzab for spotting this.
+  \item Added compatibility of the mixed precision CG implementation with older AMD GPUs. Thanks to Andreas Rost for the input.
+  \item Fixed an error when running the QR factorization for matrices with less rows than columns. Thanks to Karol Polko for reporting.
+  \item Re-added accidentally removed chapters on additional algorithms and structured matrices to the manual. Thanks to Sajjadul Islam for the hint.
+  \item Fixed buggy OpenCL kernels for matrix additions and subtractions for column-major matrices. Thanks to Tom Nicholson for reporting.
+  \item Fixed an invalid default kernel parameter set for matrix-matrix multiplications on CPUs when using the OpenCL backend. Thanks again to Tom Nicholson.
+  \item Corrected a weak check used in two tests. Thanks to Walter Mascarenhas for providing a fix.
+  \item Fixed a wrong global work size inside the SPAI preconditioner. Thanks to Andreas Rost.
+ \end{itemize}
+ 
+ \subsection*{Version 1.5.0}
+ This new minor release focuses on a more powerful API and on first steps in making ViennaCL more accessible from languages other than C++.
+ In addition to many internal improvements both in terms of performance and flexibility, the following changes are visible to users:
+ \begin{itemize}
+  \item API-change: User-provided OpenCL kernels extract their kernels automatically. A call to \lstinline|add_kernel()| is now obsolete, hence the function was removed.
+  \item API-change: The device class has been extended and supports all information defined in the OpenCL 1.1 standard through member functions. The duplicate members \lstinline|compute_units()| and \lstinline|max_work_group_size()| have been removed (thanks to Shantanu Agarwal for the input).
+  \item API-change: \lstinline|viennacl::copy()| from a ViennaCL object to an object of non-ViennaCL type no longer tries to resize the object accordingly. An assertion is thrown if the sizes are incorrect in order to provide a consistent behavior across many different types.
+  \item Data structure change: Vectors and matrices are now padded with zeros by default, resulting in higher performance particularly for matrix operations. This padding needs to be taken into account when using \lstinline|fast_copy()|, particularly for matrices.
+  \item Fixed problems with CUDA and CMake+CUDA on Visual Studio.
+  \item \lstinline|coordinate_matrix<>| now also behaves correctly for tiny matrix dimensions.
+  \item CMake 2.6 as new minimum requirement instead of CMake 2.8.
+  \item Vectors and matrices can be instantiated with integer template types (long, int, short, char).
+  \item Added support for \lstinline|element_prod()| and \lstinline|element_div()| for dense matrices.
+  \item Added \lstinline|element_pow()| for vectors and matrices.
+  \item Added \lstinline|norm_frobenius()| for computing the Frobenius norm of dense matrices.
+  \item Added unary element-wise operations for vectors and dense matrices: \lstinline|element_sin()|, \lstinline|element_sqrt()|, etc.
+  \item Multiple OpenCL contexts can now be used in a multi-threaded setting (one thread per context).
+  \item Multiple inner products with a common vector can now be computed efficiently via e.g.~\lstinline|inner_prod(x, tie(y, z));|
+  \item Added support for \lstinline|prod(A, B)|, where \lstinline|A| is a sparse matrix type and \lstinline|B| is a dense matrix (thanks to Albert Zaharovits for providing parts of the implementation).
+  \item Added \lstinline|diag()| function for extracting the diagonal of a vector to a matrix, or for generating a square matrix from a vector with the vector elements on a diagonal (similar to MATLAB).
+  \item Added \lstinline|row()| and \lstinline|column()| functions for extracting a certain row or column of a matrix to a vector.
+  \item Sparse matrix-vector products now also work with vector strides and ranges.
+  \item Added \lstinline|async_copy()| for vectors to allow for a better overlap of computation and communication.
+  \item Added \lstinline|compressed_compressed_matrix| type for the efficient representation of CSR matrices with only a few nonzero rows.
+  \item Added possibility to switch command queues in OpenCL contexts.
+  \item Improved performance of Block-ILU by removing one spurious conversion step.
+  \item Improved performance of Cuthill-McKee algorithm by about 40 percent.
+  \item Improved performance of power iteration by avoiding the creation of temporaries in each step.
+  \item Removed spurious status message to cout in matrix market reader and nonnegative matrix factorization.
+  \item The OpenCL kernel launch logic no longer attempts to re-launch the kernel with smaller work sizes if an error is encountered (thanks to Peter Burka for pointing this out).
+  \item Reduced overhead for lengthy expressions involving temporaries (at the cost of increased compilation times).
+  \item \lstinline|vector| and \lstinline|matrix| are now padded to dimensions that are multiples of 128 by default. This greatly improves GEMM performance for arbitrary sizes.
+  \item Loop indices for OpenMP parallelization are now all signed, increasing compatibility with older OpenMP implementations (thanks to Mrinal Deo for the hint).
+  \item Complete rewrite of the generator. Now uses the scheduler for specifying the operation. Includes a full device database for portable high performance of GEMM kernels.
+  \item Added micro-scheduler for attaching the OpenCL kernel generator to the user API.
+  \item Certain BLAS functionality in ViennaCL is now also available through a shared library (libviennacl).
+  \item Removed the external kernel parameter tuning facility, which is to be replaced by an internal device database through the kernel generator.
+  \item Completely eliminated the OpenCL kernel conversion step in the developer repository and the source-release. One can now use the developer version without the need for a Boost installation.
+ \end{itemize}
+ 
+ 
+ \section*{Version 1.4.x}
+ 
+ \subsection*{Version 1.4.2}
+ This is a maintenance release, particularly resolving compilation problems with Visual Studio 2012.
+ \begin{itemize}
+  \item Largely refactored the internal code base, unifying code for \lstinline|vector|, \lstinline|vector_range|, and \lstinline|vector_slice|.
+        Similar code refactoring was applied to \lstinline|matrix|, \lstinline|matrix_range|, and \lstinline|matrix_slice|.
+        This not only resolves the problems in VS 2012, but also leads to shorter compilation times and a smaller code base.
+  \item Improved performance of matrix-vector products of \lstinline|compressed_matrix| on CPUs using OpenCL.
+  \item Resolved a bug which shows up if certain rows and columns of a \lstinline|compressed_matrix| are empty and the matrix is copied back to host.
+  \item Fixed a bug and improved performance of GMRES. Thanks to Ivan Komarov for reporting via sourceforge.
+  \item Added additional Doxygen documentation.
+ \end{itemize}
+ 
+ 
+ \subsection*{Version 1.4.1}
+ This release focuses on improved stability and performance on AMD devices rather than introducing new features:
+ \begin{itemize}
+  \item Included fast matrix-matrix multiplication kernel for AMD's Tahiti GPUs if matrix dimensions are a multiple of 128.
+        Our sample HD7970 reaches over 1.3 TFLOPs in single precision and 200 GFLOPs in double precision (counting multiplications and additions as separate operations).
+  \item All benchmark FLOPs are now using the common convention of counting multiplications and additions separately (ignoring fused multiply-add).
+  \item Fixed a bug for matrix-matrix multiplication with \lstinline|matrix_slice<>| when slice dimensions are multiples of 64.
+  \item Improved detection logic for Intel OpenCL SDK.
+  \item Fixed issues when resizing an empty \lstinline|compressed_matrix|.
+  \item Fixes and improved support for BLAS-1-type operations on dense matrices and vectors.
+  \item Vector expressions can now be passed directly to \lstinline|inner_prod()|, \lstinline|norm_1()|, \lstinline|norm_2()|, and \lstinline|norm_inf()|.
+  \item Improved performance when using OpenMP.
+  \item Better support for Intel Xeon Phi (MIC).
+  \item Resolved problems when using OpenCL for CPUs if the number of cores is not a power of 2.
+  \item Fixed a flaw when using AMG in debug mode. Thanks to Jakub Pola for reporting.
+  \item Removed accidental external linkage (invalidating header-only model) of SPAI-related functions. Thanks again to Jakub Pola.
+  \item Fixed issues with copy back to host when OpenCL handles are passed to CTORs of vector, matrix, or \lstinline|compressed_matrix|. Thanks again to Jakub Pola.
+  \item Added fix for segfaults on program exit when providing custom OpenCL queues. Thanks to Denis Demidov for reporting.
+  \item Fixed bug in \lstinline|copy()| to \lstinline|hyb_matrix| as reported by Denis Demidov (thanks!).
+  \item Added an overload for \lstinline|result_of::alignment| for \lstinline|vector_expression|. Thanks again to Denis Demidov.
+  \item Added SSE-enabled code contributed by Alex Christensen.
+ \end{itemize}
+ 
+ 
+ 
+ \subsection*{Version 1.4.0}
+ The transition from 1.3.x to 1.4.x features the largest number of additions, improvements, and cleanups since the initial release.
+ In particular, host-, OpenCL-, and CUDA-based execution is now supported. OpenCL now needs to be enabled explicitly!
+ New features and feature improvements are as follows:
+ \begin{itemize}
+  \item Added host-based and CUDA-enabled operations on ViennaCL objects. The default is now a host-based execution for reasons of compatibility.
+        Enable OpenCL- or CUDA-based execution by defining the preprocessor constants \lstinline|VIENNACL_WITH_OPENCL| and \lstinline|VIENNACL_WITH_CUDA|, respectively.
+        Note that CUDA-based execution requires the use of nvcc.
+  \item Added mixed-precision CG solver (OpenCL-based).
+  \item Greatly improved performance of ILU0 and ILUT preconditioners (up to 10-fold). Also fixed a bug in ILUT.
+  \item Added initializer types from Boost.uBLAS (\lstinline|unit_vector|, \lstinline|zero_vector|, \lstinline|scalar_vector|, \lstinline|identity_matrix|, \lstinline|zero_matrix|, \lstinline|scalar_matrix|).
+        Thanks to Karsten Ahnert for suggesting the feature.
+  \item Added incomplete Cholesky factorization preconditioner.
+  \item Added element-wise operations for vectors as available in Boost.uBLAS (\lstinline|element_prod|, \lstinline|element_div|).
+  \item Added restart-after-N-cycles option to BiCGStab.
+  \item Added level-scheduling for ILU-preconditioners. Performance strongly depends on matrix pattern.
+  \item Added least-squares example including a function \lstinline|inplace_qr_apply_trans_Q()| to compute the right hand side vector $Q^T b$ without rebuilding $Q$.
+  \item Improved performance of LU-factorization of dense matrices.
+  \item Improved dense matrix-vector multiplication performance (thanks to Philippe Tillet).
+  \item Reduced overhead when copying to/from \lstinline|ublas::compressed_matrix|.
+  \item ViennaCL objects (scalar, vector, etc.) can now be used as global variables (thanks to an anonymous user on the support-mailinglist).
+  \item Refurbished OpenCL vector kernels backend.
+        All operations of the type v1 = a v2 @ b v3 for vectors v1, v2, v3 and scalars a and b, including the variants += and -= instead of =, are now free of temporaries. The same holds for matrices.
+  \item \lstinline|matrix_range| and \lstinline|matrix_slice| as well as \lstinline|vector_range| and \lstinline|vector_slice| can now be used and mixed completely seamlessly with all standard operations except \lstinline|lu_factorize()|.
+  \item Fixed a bug when using copy() with iterators on vector proxy objects.
+  \item Final reduction step in \lstinline|inner_prod()| and norms is now computed on CPU if the result is a CPU scalar.
+  \item Reduced kernel launch overhead of simple vector kernels by packing multiple kernel arguments together.
+  \item Updated SVD code and added routines for the computation of symmetric eigenvalues using OpenCL.
+  \item \lstinline|custom_operation|'s constructor now supports multiple arguments, allowing multiple expressions to be packed into the same kernel for improved performance. However, all the data structures in the packed operations must have the same size.
+  \item Further improvements to the OpenCL kernel generator: Added a repeat feature for generating loops inside a kernel,
+        added element-wise products and division, added support for every one-argument OpenCL function.
+  \item The name of the operation is now a mandatory argument of the constructor of \lstinline|custom_operation|.
+  \item Improved performance of the generated matrix-vector product code.
+  \item Updated interfacing code for the Eigen library, now working with Eigen 3.x.y.
+  \item Converter in source-release now depends on Boost.filesystem3 instead of Boost.filesystem2, thus requiring Boost 1.44 or above.
+ \end{itemize}
+ 
+ 
+ 
+ 
+ 
+ \section*{Version 1.3.x}
+ 
+ \subsection*{Version 1.3.1}
+ The following bugfixes and enhancements have been applied:
+ \begin{itemize}
+  \item Fixed a compilation problem with GCC 4.7 caused by the wrong order of function declarations. Also removed unnecessary indirections and unused variables.
+  \item Improved out-of-source build in the src-version (for packagers).
+  \item Added virtual destructor in the \lstinline|runtime_wrapper|-class in the kernel generator.
+  \item Extended flexibility of submatrix and subvector proxies (ranges, slices).
+  \item Block-ILU for \lstinline|compressed_matrix| is now applied on the GPU during the solver cycle phase. However, for the moment the implementation file in \newline \texttt{viennacl/linalg/detail/ilu/opencl\_block\_ilu.hpp} needs to be included separately in order to avoid an OpenCL dependency for all ILU implementations.
+  \item SVD now supports double precision.
+  \item Slightly adjusted the interface for NMF. The approximation rank is now specified by the supplied matrices $W$ and $H$.
+  \item Fixed a problem with matrix-matrix products if the result matrix is not initialized properly (thanks to Laszlo Marak for finding the issue and a fix).
+  \item The operations $C += prod(A, B)$ and $C -= prod(A, B)$ for matrices A, B, and C no longer introduce temporaries if the three matrices are distinct.
+ \end{itemize}
+ 
+ 
+ 
+ \subsection*{Version 1.3.0}
+ Several new features enter this new minor version release.
+ Some of the experimental features introduced in 1.2.0 keep their experimental state in 1.3.x due to the short time since 1.2.0, with exceptions listed below along with the new features:
+ \begin{itemize}
+  \item Full support for ranges and slices for dense matrices and vectors (no longer experimental)
+  \item QR factorization now possible for arbitrary matrix sizes (no longer experimental)
+  \item Further improved matrix-matrix multiplication performance for matrix dimensions which are a multiple of 64 (particularly improves performance for NVIDIA GPUs)
+  \item Added Lanczos and power iteration method for eigenvalue computations of dense and sparse matrices (experimental, contributed by G\"unther Mader and Astrid Rupp)
++ \item Added singular value decomposition in single precision (experimental, contributed by Volodymyr Kysenko)
++ \item Two new ILU-preconditioners added: ILU0 (contributed by Evan Bollig) and a block-diagonal ILU preconditioner using either ILUT or ILU0 for each block. Both preconditioners are computed entirely on the CPU.
++ \item Automated OpenCL kernel generator based on high-level operation specifications added (many thanks to Philippe Tillet who had a lot of \emph{fun fun fun} working on this)
++ \item Two new sparse matrix types contributed by Volodymyr Kysenko: \lstinline|ell_matrix| for the ELL format and \lstinline|hyb_matrix| for a hybrid format.
++ \item Added possibility to specify the OpenCL platform used by a context
++ \item Build options for the OpenCL compiler can now be supplied to a context (thanks to Krzysztof Bzowski for the suggestion)
++ \item Added nonnegative matrix factorization by Lee and Seung (contributed by Volodymyr Kysenko).
++\end{itemize}
++
++
 +
 +\section*{Version 1.3.x}
 +
 +\subsection*{Version 1.3.0}
 +Several new features enter this new minor version release.
 +Some of the experimental features introduced in 1.2.0 keep their experimental state in 1.3.x due to the short time since 1.2.0, with exceptions listed below along with the new features:
 +\begin{itemize}
 + \item Full support for ranges and slices for dense matrices and vectors (no longer experimental)
 + \item QR factorization now possible for arbitrary matrix sizes (no longer experimental)
 + \item Further improved matrix-matrix multiplication performance for matrix dimensions which are a multiple of 64 (particularly improves performance for NVIDIA GPUs)
 + \item Added Lanczos and power iteration method for eigenvalue computations of dense and sparse matrices (experimental, contributed by G\"unther Mader and Astrid Rupp)
   \item Added singular value decomposition in single precision (experimental, contributed by Volodymyr Kysenko)
   \item Two new ILU-preconditioners added: ILU0 (contributed by Evan Bollig) and a block-diagonal ILU preconditioner using either ILUT or ILU0 for each block. Both preconditioners are computed entirely on the CPU.
   \item Automated OpenCL kernel generator based on high-level operation specifications added (many thanks to Philippe Tillet who had a lot of \emph{fun fun fun} working on this)
diff --cc doc/manual/contributors.tex
index cccfd0e,79f24f9..a1edba4
--- a/doc/manual/contributors.tex
+++ b/doc/manual/contributors.tex
@@@ -14,6 -17,7 +17,10 @@@ Karl Rupp\
  \textit{Code Contributors:} \\
  
  Evan Bollig \\
++<<<<<<< HEAD
++=======
+ Alex Christensen (BYU) \\
++>>>>>>> upstream/1.5.1
  Philipp Grabenweger \\
  Volodymyr Kysenko \\
  Nikolay Lukash \\
@@@ -21,6 -25,7 +28,10 @@@ G\"unther Mader \
  Vittorio Patriarca \\
  Florian Rudolf \\
  Astrid Rupp \\
++<<<<<<< HEAD
++=======
+ Toby St Clere Smithe \\
++>>>>>>> upstream/1.5.1
  Philippe Tillet \\
  Markus Wagner \\
  Josef Weinbub \\
diff --cc doc/manual/cover.tex
index 30a71ea,b9e5fe5..b902a56
--- a/doc/manual/cover.tex
+++ b/doc/manual/cover.tex
@@@ -2,7 -2,7 +2,11 @@@
  \begin{titlepage}
  
  \vspace*{3cm}
++<<<<<<< HEAD
 +\Huge{ViennaCL 1.3.0} 
++=======
+ \Huge{ViennaCL 1.5.1}
++>>>>>>> upstream/1.5.1
  \rule[0.0cm]{9.5cm}{0.05cm}
  \begin{flushright}
  \Large{User Manual}
diff --cc doc/manual/kernel-generation.tex
index 10289bb,f1d78a8..4b8c4b8
--- a/doc/manual/kernel-generation.tex
+++ b/doc/manual/kernel-generation.tex
@@@ -1,34 -1,39 +1,71 @@@
++<<<<<<< HEAD
 +\chapter{Automated User-Kernel Generation} \label{chap:kernel-generation}
 +
 +While {\ViennaCL} provides a convenient means of including custom compute kernels, cf.~Chap.~\ref{chap:custom},
++=======
+ \chapter{Automated OpenCL User-Kernel Generation} \label{chap:kernel-generation}
+ 
+ While {\ViennaCL} provides a convenient means of including custom {\OpenCL} compute kernels, cf.~Chap.~\ref{chap:custom},
++>>>>>>> upstream/1.5.1
  it can be rather tedious to come up with a good compute kernel, or to come up with many similar kernels differing in small details only.
  For the case of BLAS level 1 and level 2 operations, {\ViennaCL} now provides an automated kernel generator, which takes a high-level specification of the operations and creates one or more suitable OpenCL kernels.
  This allows for high-performance implementations of algorithms which may otherwise lead to spurious temporary objects.
  
++<<<<<<< HEAD
 +As our second example, we consider the operation
++=======
+ Consider the operation
++>>>>>>> upstream/1.5.1
  \begin{align*}
  \mathbf{x} = \mathbf{A} \times \bigl[ (\mathbf{y} \cdot (\mathbf{y}+\mathbf{z}))\mathbf{y} + \mathbf{z} \bigr] \ ,
  \end{align*}
  where $\mathbf{x}$, $\mathbf{y}$ and $\mathbf{z}$ denote vectors, $\mathbf{A}$ is a dense matrix, and the dot denotes the vector dot product.
++<<<<<<< HEAD
 +With the proposed generator it is sufficient to write the following C++ code:
 +\begin{lstlisting}
 +// Instantiation of the symbolic variables
 +symbolic_vector<0, NumericT> sX;
 +symbolic_matrix<1, NumericT> sA;
 +symbolic_vector<2, NumericT> sY;
 +symbolic_vector<3, NumericT> sZ;
 +
 +//Creation of the custom operation
 +custom_operation my_op(
 + sX = prod(sA, inner_prod(sY, sY+sZ) * sY + sZ)
 +                      );
 +\end{lstlisting}
 +where \lstinline|NumericT| is either \lstinline|float| or \lstinline|double|.
 +The custom operation object \lstinline|my_op| can then be enqueued like any other kernel:
++=======
+ With the generator it is sufficient to write the following C++ code in order to obtain an OpenCL kernel:
+ \begin{lstlisting}
+ // Instantiation of the symbolic variables
+ symbolic_vector<NumericT, 0> sX;
+ symbolic_matrix<NumericT, 1> sA;
+ symbolic_vector<NumericT, 2> sY;
+ symbolic_vector<NumericT, 3> sZ;
+ 
+ //Creation of the custom operation
+ custom_operation my_op( sX = prod(sA, inner_prod(sY, sY+sZ) * sY + sZ),
+                         "operation_name" );
+ \end{lstlisting}
+ where \lstinline|NumericT| is either \lstinline|float| or \lstinline|double|.
+ The string provided as the second parameter is required and can be used to identify, manage, and retrieve different kernels.
+ No two \lstinline|custom_operation|s are allowed to be identified using the same string.
+ 
+ The custom operation object \lstinline|my_op| can be enqueued like any other kernel:
++>>>>>>> upstream/1.5.1
  \begin{lstlisting}
  //Execution of the custom operation
  viennacl::ocl::enqueue(my_op(x,A,y,z));
  \end{lstlisting}
  Here, \lstinline|x|, \lstinline|y|, \lstinline|z| are of type \lstinline|viennacl::vector<NumericT>| and \lstinline|A| is of type \lstinline|viennacl::matrix<NumericT>|.
  
- \TIP{Sample code can be found in \lstinline|tests/src/generator_*.cpp|}
++<<<<<<< HEAD
++\TIP{Sample code can be found in \lstinline|tests/src/generator_*.cpp|}
++=======
+ \TIP{Sample code can be found in \lstinline|tests/src/generator_*.cpp|}
+ 
+ \NOTE{ The kernel generator is still experimental, yet already able to generate rather complex compute kernels. }
+ 
++>>>>>>> upstream/1.5.1
diff --cc doc/manual/multi-device.tex
index 6449ef4,f8ce6d3..e441ebf
--- a/doc/manual/multi-device.tex
+++ b/doc/manual/multi-device.tex
@@@ -19,7 -21,7 +21,11 @@@ This default context is identified by t
  If a different platform should be used on a machine with multiple platforms available,
  this can be achieved with
  \begin{lstlisting}
++<<<<<<< HEAD
 + viennacl::ocl::setup_context_platform_index(id, platform_index);
++=======
+  viennacl::ocl::set_context_platform_index(id, platform_index);
++>>>>>>> upstream/1.5.1
  \end{lstlisting}
  where the context ID is \lstinline|id| and \lstinline|platform_index| refers to the array index of the platform as returned by \lstinline|clGetPlatformIDs()|.
  
@@@ -28,8 -30,8 +34,13 @@@ By default, only the first device in th
   viennacl::ocl::current_context().current_device();
   viennacl::ocl::current_device(); //equivalent to above
  \end{lstlisting}
++<<<<<<< HEAD
 +A user may wish to use multiple contexts, where each context consists of a subset of the available devices. 
 +To setup a context with ID \lstinline|id| with a particular device type only, the user has to specify this 
++=======
+ A user may wish to use multiple {\OpenCL} contexts, where each context consists of a subset of the available devices.
+ To set up a context with ID \lstinline|id| with a particular device type only, the user has to specify this
++>>>>>>> upstream/1.5.1
  prior to any other {\ViennaCL} related statements:
  \begin{lstlisting}
  //use only GPUs:
@@@ -89,11 -91,11 +100,19 @@@ If the supplied device is not part of t
  
  
  \section{Setting OpenCL Compiler Flags}
++<<<<<<< HEAD
 +Each context provides a member function \lstinline|.build_options()|, which can be used to pass OpenCL compiler flags prior to compilation.
++=======
+ Each {\OpenCL} context provides a member function \lstinline|.build_options()|, which can be used to pass OpenCL compiler flags prior to compilation.
++>>>>>>> upstream/1.5.1
  Note that flags need to be passed to the context prior to the compilation of the respective kernels, i.e.~prior to the first instantiation of the respective matrix or vector types.
  
  To pass the \lstinline|-cl-mad-enable| flag to the current context, the line
  \begin{lstlisting}
   viennacl::ocl::current_context().build_options("-cl-mad-enable");
  \end{lstlisting}
- is sufficient. Confer to the {\OpenCL} standard for a full list of flags.
++<<<<<<< HEAD
+ is sufficient. Refer to the {\OpenCL} standard for a full list of flags.
++=======
++is sufficient. Refer to the {\OpenCL} standard for a full list of flags.
++>>>>>>> upstream/1.5.1
diff --cc doc/manual/types.tex
index b7254a1,ecb31a9..8c1edfc
--- a/doc/manual/types.tex
+++ b/doc/manual/types.tex
@@@ -457,10 -355,10 +355,14 @@@ The use of \texttt{coordinate\_matrix$<
  
  
  \subsubsection{Members}
- The interface is described in Tab.~\ref{tab:coordinate-matrix-interface}. 
+ The interface is described in Tab.~\ref{tab:coordinate-matrix-interface}.
  
  %\TIP{In {\ViennaCLversion} the use of \lstinline|compressed\_matrix| over \lstinline|coordinate\_matrix| is encouraged due to better performance!}
++<<<<<<< HEAD
 +\NOTE{Note that preconditioners in Sec.~\ref{sec:preconditioner} do not work with \lstinline|coordinate_matrix| yet.}
++=======
+ \NOTE{Note that only a few preconditioners work with \lstinline|coordinate_matrix| so far, cf.~Sec.~\ref{sec:preconditioner}.}
++>>>>>>> upstream/1.5.1
  
  
  \subsection{ELL Matrix}
@@@ -478,10 -376,18 +380,25 @@@ For an example use of an \lstinline|ell
  \subsection{Hybrid Matrix}
  The higher performance of the ELL format for matrices with approximately the same number of entries per row
  and the higher flexibility of the CSR format is combined in the \lstinline|hyb_matrix| type, where the main part of the system matrix is stored in ELL format and excess entries are stored in CSR format.
++<<<<<<< HEAD
 +
 +For an example use of a \lstinline|hyb_matrix|, have a look at \lstinline|examples/benchmarks/sparse.cpp|.
 +
 +\NOTE{Note that preconditioners in Sec.~\ref{sec:preconditioner} do not work with \lstinline|hyb_matrix| yet.}
++=======
+ 
+ For an example use of a \lstinline|hyb_matrix|, have a look at \lstinline|examples/benchmarks/sparse.cpp|.
+ 
+ \NOTE{Note that preconditioners in Sec.~\ref{sec:preconditioner} do not work with \lstinline|hyb_matrix| yet.}
+ 
+ \subsection{Compressed Compressed Matrix}
+ If only a few rows of a sparse matrix are populated, then the previous sparse matrix formats are fairly expensive in terms of memory consumption.
+ This is addressed by the \lstinline|compressed_compressed_matrix<>| format, which is similar to the standard CSR format, but only stores the rows containing nonzero elements.
+ An additional array stores, for the $i$-th nonzero row, its global row index $r$ in the sparse matrix $A$.
+ 
+ \NOTE{Note that preconditioners in Sec.~\ref{sec:preconditioner} do not work with \lstinline|compressed_compressed_matrix| yet.}
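+ 
+ A minimal usage sketch (assuming the usual \lstinline|viennacl::copy()| conventions for sparse matrix types carry over to this format):
+ \begin{lstlisting}
+ //host matrix with only a few populated rows:
+ std::vector< std::map<unsigned int, float> > host_A(100000);
+ host_A[42][7]   = 3.14f;
+ host_A[137][42] = 2.71f;
+ 
+ viennacl::compressed_compressed_matrix<float> vcl_A;
+ viennacl::copy(host_A, vcl_A); //only nonzero rows are stored
+ \end{lstlisting}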
+ 
++>>>>>>> upstream/1.5.1
  
  \section{Proxies}
  Similar to {\ublas}, {\ViennaCL} provides \lstinline|range| and \lstinline|slice| objects in order to conveniently manipulate dense submatrices and vectors. The functionality is
@@@ -521,7 -427,7 +438,11 @@@ The proxy objects can now be manipulate
  additions work as usual, e.g.
  \begin{lstlisting}
   vcl_sub += vcl_sub; //or project(v, r) += project(v, r);
++<<<<<<< HEAD
 + M_sub += M_sub;     //or project(M, r, r) += project(M, r, r);
++=======
+  M_sub   += M_sub;   //or project(M, r, r) += project(M, r, r);
++>>>>>>> upstream/1.5.1
  \end{lstlisting}
   Submatrix-Submatrix products are computed in the same manner and are handy for many block-based linear algebra algorithms.
  
diff --cc doc/manual/viennacl.bib
index c5bbcdf,bdda2c3..cf9fd32
--- a/doc/manual/viennacl.bib
+++ b/doc/manual/viennacl.bib
@@@ -167,4 -172,4 +172,8 @@@
   booktitle = {Advances in Neural Information Processing Systems 13},
   pages = {556–562},
   year = {2000},
++<<<<<<< HEAD
 +} 
++=======
+ }
++>>>>>>> upstream/1.5.1
diff --cc doc/manual/viennacl.tex
index 2370e8f,85c60f8..5a39d16
--- a/doc/manual/viennacl.tex
+++ b/doc/manual/viennacl.tex
@@@ -55,11 -59,12 +59,17 @@@
  \newcommand{\CMake} {\texttt{CMake}}
  \newcommand{\OpenMP} {\texttt{OpenMP}}
  \newcommand{\OpenCL} {\texttt{OpenCL}}
+ \newcommand{\CUDA} {\texttt{CUDA}}
  \newcommand{\ViennaCL} {\texttt{ViennaCL}}
++<<<<<<< HEAD
 +\newcommand{\ViennaCLversion} {\texttt{ViennaCL 1.3.0}}
 +\newcommand{\ViennaCLminorversion} {\texttt{ViennaCL 1.3.x}}
++=======
+ \newcommand{\ViennaCLversion} {\texttt{ViennaCL 1.5.1}}
+ \newcommand{\ViennaCLminorversion} {\texttt{ViennaCL 1.5.x}}
++>>>>>>> upstream/1.5.1
  \newcommand{\Boost} {\texttt{Boost}}
- \newcommand{\ublas} {\texttt{ublas}}
+ \newcommand{\ublas} {\texttt{uBLAS}}
  \newcommand{\Eigen} {\texttt{Eigen}}
  \newcommand{\MTL} {\texttt{MTL 4}}
  \newcommand{\GCC} {\texttt{GCC}}
@@@ -90,11 -133,22 +138,29 @@@ library users are advised to use them w
  \include{multi-device}
  \include{custom-kernels}
  \include{custom-contexts}
++<<<<<<< HEAD
 +\include{kernel-generation}
 +\include{tuning}
 +\include{other-libs}
 +\include{benchmarks}
++=======
+ %\include{kernel-generation}
+ %\include{tuning}
+ \include{structured-matrices}
+ 
+ 
+ %%%%%%%%%%%%%%% Addon Functionality %%%%%%%%%%%%%%%%
+ 
+ \part{Miscellaneous}
++>>>>>>> upstream/1.5.1
  \include{design}
+ 
+ % Appendix
+ %\appendix
+ %\appendixpage
+ %\addappheadtotoc
+ 
+ \begin{appendices}
  \include{versioning}
  \include{changelogs}
  \include{license}
diff --cc examples/benchmarks/CMakeLists.txt
index b2171a8,0e880c7..242acf3
--- a/examples/benchmarks/CMakeLists.txt
+++ b/examples/benchmarks/CMakeLists.txt
@@@ -1,16 -1,57 +1,68 @@@
- foreach(bench blas3 opencl vector)
-    add_executable(${bench}bench ${bench}.cpp)
-    target_link_libraries(${bench}bench ${OPENCL_LIBRARIES})
+ # Targets using CPU-based execution
+ foreach(bench blas3 copy scheduler vector)
+    add_executable(${bench}bench-cpu ${bench}.cpp)
  endforeach()
  
++<<<<<<< HEAD
 +if(ENABLE_UBLAS)
 +   include_directories(${Boost_INCLUDE_DIRS})
 +   foreach(bench sparse solver)
 +      add_executable(${bench}bench ${bench}.cpp)
 +      target_link_libraries(${bench}bench ${OPENCL_LIBRARIES})
 +   endforeach()
 +endif()
 +
++=======
+ if (ENABLE_UBLAS)
+     include_directories(${Boost_INCLUDE_DIRS})
+     foreach(bench sparse solver)
+       add_executable(${bench}bench-cpu ${bench}.cpp)
+       target_link_libraries(${bench}bench-cpu ${Boost_LIBRARIES})
+     endforeach()
+ endif (ENABLE_UBLAS)
+ 
+ 
+ # Targets using OpenCL
+ if (ENABLE_OPENCL)
+ 
+   foreach(bench blas3 copy
+           generator_blas1 generator_blas2 generator_blas3
+           opencl vector)
+     add_executable(${bench}bench-opencl ${bench}.cpp)
+     target_link_libraries(${bench}bench-opencl ${OPENCL_LIBRARIES})
+     set_target_properties(${bench}bench-opencl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+   endforeach()
+ 
+   if (ENABLE_UBLAS)
+      include_directories(${Boost_INCLUDE_DIRS})
+      foreach(bench sparse solver)
+        add_executable(${bench}bench-opencl ${bench}.cpp)
+        target_link_libraries(${bench}bench-opencl ${OPENCL_LIBRARIES} ${Boost_LIBRARIES})
+        set_target_properties(${bench}bench-opencl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+      endforeach()
+   endif (ENABLE_UBLAS)
+ 
+ endif (ENABLE_OPENCL)
+ 
+ # Targets using CUDA
+ if (ENABLE_CUDA)
+ 
+   foreach(bench blas3 copy vector)
+      cuda_add_executable(${bench}bench-cuda ${bench}.cu)
+   endforeach()
+ 
+   if (ENABLE_UBLAS)
+      include_directories(${Boost_INCLUDE_DIRS})
+      foreach(bench sparse solver)
+        cuda_add_executable(${bench}bench-cuda ${bench}.cu)
+        target_link_libraries(${bench}bench-cuda ${Boost_LIBRARIES})
+      endforeach()
+   endif (ENABLE_UBLAS)
+ 
+ endif (ENABLE_CUDA)
+ 
+ 
++>>>>>>> upstream/1.5.1
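  
  # For reference, a typical configuration enabling the targets above might be
  # invoked as follows (option names as used in this file; the exact call
  # depends on the local build setup):
  #   cmake -DENABLE_UBLAS=ON -DENABLE_OPENCL=ON -DENABLE_CUDA=ON ..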
  # IF(CMAKE_COMPILER_IS_GNUCXX)
     #ADD_DEFINITIONS(-Wall -pedantic -O0 -g)
  #   ADD_DEFINITIONS(-Wall -pedantic -O3)
diff --cc examples/benchmarks/blas3.cpp
index 3945d13,128c8b8..95e0447
--- a/examples/benchmarks/blas3.cpp
+++ b/examples/benchmarks/blas3.cpp
@@@ -1,200 -1,246 +1,449 @@@
++<<<<<<< HEAD
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +//disable debug mechanisms to have a fair benchmark environment
 +#ifndef NDEBUG
 + #define NDEBUG
 +#endif
 +
 +//
 +// include necessary system headers
 +//
 +#include <iostream>
 +
 +//
 +// ViennaCL includes
 +//
 +#include "viennacl/scalar.hpp"
 +#include "viennacl/vector.hpp"
 +#include "viennacl/matrix.hpp"
 +#include "viennacl/linalg/prod.hpp"
 +#include "viennacl/matrix_proxy.hpp"
 +
 +// Some helper functions for this tutorial:
 +#include "../tutorial/Random.hpp"
 +
 +
 +#include "benchmark-utils.hpp"
 +
 +/*
 +*   Tutorial: BLAS level 3 functionality
 +*   
 +*/
 +
 +#define BLAS3_MATRIX_SIZE   1024
 +
 +template<typename ScalarType>
 +int run_benchmark()
 +{
 +  Timer timer;
 +  double exec_time;
 +
 +  //
 +  // One alternative: put the matrices into a contiguous block of memory (allows the use of viennacl::fast_copy(), avoiding temporary memory)
 +  //
 +  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
 +  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
 +  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
 +
 +  //
 +  // Fill the matrices
 +  //
 +  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
 +    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
 +      stl_A[i*BLAS3_MATRIX_SIZE + j] = random<ScalarType>();
 +
 +  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
 +    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
 +      stl_B[i + j*BLAS3_MATRIX_SIZE] = random<ScalarType>();
 +
 +  //
 +  // Set up some ViennaCL objects
 +  //
 +  viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());
 +  //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
 +  //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
 +  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
 +  viennacl::matrix<ScalarType> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
 +  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
 +  
 +  
 +  /////////////////////////////////////////////////
 +  //////////// Matrix-matrix products /////////////
 +  /////////////////////////////////////////////////
 +  
 +  //
 +  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
 +  //
 +  
 +  std::cout << " ------ Benchmark 1: Matrix-Matrix product ------ " << std::endl;
 +  
 +  
 +  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
 +  for (size_t i=0; i<devices.size(); ++i)
 +  {
 +    viennacl::ocl::current_context().switch_device(devices[i]);
 +    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
 +
 +    viennacl::fast_copy(&(stl_A[0]),
 +                        &(stl_A[0]) + stl_A.size(),
 +                        vcl_A);
 +    viennacl::fast_copy(&(stl_B[0]),
 +                        &(stl_B[0]) + stl_B.size(),
 +                        vcl_B);
 +    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
 +    viennacl::ocl::get_queue().finish();
 +    timer.start();
 +    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
 +    viennacl::ocl::get_queue().finish();
 +    exec_time = timer.get();
 +    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
 +    std::cout << " - GFLOPs (counting multiply&add as one operation): " << (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
 +    std::cout << std::endl;
 +  }
 +
 +  std::cout << " ------ Benchmark 2: Matrix-Matrix product using ranges ------ " << std::endl;
 +
 +  viennacl::range r(BLAS3_MATRIX_SIZE/4, 3 * BLAS3_MATRIX_SIZE/4);
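 +  // the range covers half of each dimension, so the GFLOP counts below use (n/2000)^3 = ((n/2)/1000)^3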
 +  for (size_t i=0; i<devices.size(); ++i)
 +  {
 +    viennacl::ocl::current_context().switch_device(devices[i]);
 +    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
 +
 +    viennacl::fast_copy(&(stl_A[0]),
 +                        &(stl_A[0]) + stl_A.size(),
 +                        vcl_A);
 +    viennacl::fast_copy(&(stl_B[0]),
 +                        &(stl_B[0]) + stl_B.size(),
 +                        vcl_B);
 +    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
 +    viennacl::ocl::get_queue().finish();
 +    timer.start();
 +    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
 +    viennacl::ocl::get_queue().finish();
 +    exec_time = timer.get();
 +    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
 +    std::cout << " - GFLOPs (counting multiply&add as one operation): " << (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
 +    std::cout << std::endl;
 +  }
 +
 +  std::cout << " ------ Benchmark 3: Matrix-Matrix product using slices ------ " << std::endl;
 +
 +  viennacl::slice s(0, 2, BLAS3_MATRIX_SIZE/2);
 +  for (size_t i=0; i<devices.size(); ++i)
 +  {
 +    viennacl::ocl::current_context().switch_device(devices[i]);
 +    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
 +
 +    viennacl::fast_copy(&(stl_A[0]),
 +                        &(stl_A[0]) + stl_A.size(),
 +                        vcl_A);
 +    viennacl::fast_copy(&(stl_B[0]),
 +                        &(stl_B[0]) + stl_B.size(),
 +                        vcl_B);
 +    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
 +    viennacl::ocl::get_queue().finish();
 +    timer.start();
 +    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
 +    viennacl::ocl::get_queue().finish();
 +    exec_time = timer.get();
 +    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
 +    std::cout << " - GFLOPs (counting multiply&add as one operation): " << (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
 +    std::cout << std::endl;
 +  }
 +
 +  return EXIT_SUCCESS;
 +}
 +
 +int main()
 +{
 +  std::cout << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << "               Device Info" << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  
 +  std::cout << viennacl::ocl::current_device().info() << std::endl;
 +  
 +  
 +  std::cout << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << "## Benchmark :: Dense Matrix-Matrix product " << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << std::endl;
 +  std::cout << "   -------------------------------" << std::endl;
 +  std::cout << "   # benchmarking single-precision" << std::endl;
 +  std::cout << "   -------------------------------" << std::endl;
 +  run_benchmark<float>();
 +  if( viennacl::ocl::current_device().double_support() )
 +  {
 +    std::cout << std::endl;
 +    std::cout << "   -------------------------------" << std::endl;
 +    std::cout << "   # benchmarking double-precision" << std::endl;
 +    std::cout << "   -------------------------------" << std::endl;
 +    run_benchmark<double>();
 +  }
 +  return 0;
 +}
++=======
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /*
+ *
+ *   Benchmark: BLAS level 3 functionality for dense matrices (blas3.cpp and blas3.cu are identical, the latter being required for compilation using CUDA nvcc)
+ *
+ */
+ 
+ //disable debug mechanisms to have a fair benchmark environment
+ #ifndef NDEBUG
+  #define NDEBUG
+ #endif
+ 
+ //#define VIENNACL_DEBUG_ALL
+ //#define VIENNACL_DEBUG_BUILD
+ 
+ //
+ // include necessary system headers
+ //
+ #include <iostream>
+ 
+ //
+ // ViennaCL includes
+ //
+ #include "viennacl/scalar.hpp"
+ #include "viennacl/vector.hpp"
+ #include "viennacl/matrix.hpp"
+ #include "viennacl/linalg/prod.hpp"
+ #include "viennacl/matrix_proxy.hpp"
+ #include "viennacl/linalg/lu.hpp"
+ 
+ // Some helper functions for this tutorial:
+ #include "../tutorial/Random.hpp"
+ 
+ 
+ #include "benchmark-utils.hpp"
+ 
+ #define BLAS3_MATRIX_SIZE   1920
+ 
+ template<typename ScalarType>
+ int run_benchmark()
+ {
+   Timer timer;
+   double exec_time;
+ 
+   //
+   // Set up some ViennaCL objects
+   //
+ #ifdef VIENNACL_WITH_OPENCL
+   viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());
+ #endif
+ 
+   //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
+   //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
+   viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+   viennacl::matrix<ScalarType> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+   viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+ 
+   //
+   // One alternative: put the matrices into a contiguous block of memory (allows the use of viennacl::fast_copy(), avoiding temporary memory)
+   //
+   std::vector<ScalarType> stl_A(vcl_A.internal_size());
+   std::vector<ScalarType> stl_B(vcl_A.internal_size());
+ 
+   //
+   // Fill the matrices
+   //
+   for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
+     for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
+       stl_A[i*BLAS3_MATRIX_SIZE + j] = random<ScalarType>();
+ 
+   for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
+     for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
+       stl_B[i + j*BLAS3_MATRIX_SIZE] = random<ScalarType>();
+ 
+ 
+   /////////////////////////////////////////////////
+   //////////// Matrix-matrix products /////////////
+   /////////////////////////////////////////////////
+ 
+   //
+   // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
+   //
+ 
+   std::cout << " ------ Benchmark 1: Matrix-Matrix product ------ " << std::endl;
+ 
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+   std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
+ #else
+   std::vector<long> devices(1);
+ #endif
+   for (std::size_t i=0; i<devices.size(); ++i)
+   {
+ #ifdef VIENNACL_WITH_OPENCL
+     viennacl::ocl::current_context().switch_device(devices[i]);
+     std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+ #endif
+ 
+     viennacl::fast_copy(&(stl_A[0]),
+                         &(stl_A[0]) + stl_A.size(),
+                         vcl_A);
+     viennacl::fast_copy(&(stl_B[0]),
+                         &(stl_B[0]) + stl_B.size(),
+                         vcl_B);
+     vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
+     viennacl::backend::finish();
+     timer.start();
+     vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
+     viennacl::backend::finish();
+     exec_time = timer.get();
+     std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+     std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
+     std::cout << std::endl;
+   }
+ 
+   std::cout << " ------ Benchmark 2: Matrix-Matrix product using ranges ------ " << std::endl;
+ 
+   viennacl::range r(BLAS3_MATRIX_SIZE/4, 3 * BLAS3_MATRIX_SIZE/4);
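+   // the range covers half of each dimension, so the GFLOP counts below use (n/2000)^3 = ((n/2)/1000)^3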
+   for (std::size_t i=0; i<devices.size(); ++i)
+   {
+ #ifdef VIENNACL_WITH_OPENCL
+     viennacl::ocl::current_context().switch_device(devices[i]);
+     std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+ #endif
+ 
+     viennacl::fast_copy(&(stl_A[0]),
+                         &(stl_A[0]) + stl_A.size(),
+                         vcl_A);
+     viennacl::fast_copy(&(stl_B[0]),
+                         &(stl_B[0]) + stl_B.size(),
+                         vcl_B);
+     viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
+     viennacl::backend::finish();
+     timer.start();
+     viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
+     viennacl::backend::finish();
+     exec_time = timer.get();
+     std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+     std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
+     std::cout << std::endl;
+   }
+ 
+   std::cout << " ------ Benchmark 3: Matrix-Matrix product using slices ------ " << std::endl;
+ 
+   viennacl::slice s(0, 2, BLAS3_MATRIX_SIZE/2);
+   for (std::size_t i=0; i<devices.size(); ++i)
+   {
+ #ifdef VIENNACL_WITH_OPENCL
+     viennacl::ocl::current_context().switch_device(devices[i]);
+     std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+ #endif
+ 
+     viennacl::fast_copy(&(stl_A[0]),
+                         &(stl_A[0]) + stl_A.size(),
+                         vcl_A);
+     viennacl::fast_copy(&(stl_B[0]),
+                         &(stl_B[0]) + stl_B.size(),
+                         vcl_B);
+     viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
+     viennacl::backend::finish();
+     timer.start();
+     viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
+     viennacl::backend::finish();
+     exec_time = timer.get();
+     std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+     std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
+     std::cout << std::endl;
+   }
+ 
+ 
+   std::cout << " ------ Benchmark 4: LU factorization ------ " << std::endl;
+ 
+   for (std::size_t i=0; i<devices.size(); ++i)
+   {
+ #ifdef VIENNACL_WITH_OPENCL
+     viennacl::ocl::current_context().switch_device(devices[i]);
+     std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+ #endif
+ 
+     viennacl::fast_copy(&(stl_A[0]),
+                         &(stl_A[0]) + stl_A.size(),
+                         vcl_A);
+     viennacl::linalg::lu_factorize(vcl_A);
+     viennacl::backend::finish();
+     timer.start();
+     viennacl::linalg::lu_factorize(vcl_A);
+     viennacl::backend::finish();
+     exec_time = timer.get();
+     std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+     std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_A.size2() / 1000.0) / exec_time << std::endl;
+     std::cout << std::endl;
+   }
+ 
+   return EXIT_SUCCESS;
+ }
+ 
+ int main()
+ {
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "               Device Info" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+   std::cout << viennacl::ocl::current_device().info() << std::endl;
+ #endif
+ 
+ 
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Benchmark :: Dense Matrix-Matrix product " << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   std::cout << "   -------------------------------" << std::endl;
+   std::cout << "   # benchmarking single-precision" << std::endl;
+   std::cout << "   -------------------------------" << std::endl;
+   run_benchmark<float>();
+ #ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+ #endif
+   {
+     std::cout << std::endl;
+     std::cout << "   -------------------------------" << std::endl;
+     std::cout << "   # benchmarking double-precision" << std::endl;
+     std::cout << "   -------------------------------" << std::endl;
+     run_benchmark<double>();
+   }
+   return 0;
+ }
++>>>>>>> upstream/1.5.1
diff --cc examples/benchmarks/solver.cpp
index f21aaeb,5a27a3c..09c6093
--- a/examples/benchmarks/solver.cpp
+++ b/examples/benchmarks/solver.cpp
@@@ -1,451 -1,645 +1,1099 @@@
++<<<<<<< HEAD
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +#ifndef NDEBUG
 + #define NDEBUG
 +#endif
 +
 +#include <boost/numeric/ublas/matrix_sparse.hpp>
 +#include <boost/numeric/ublas/operation_sparse.hpp>
 +
 +#define VIENNACL_HAVE_UBLAS 1
 +
 +#include "viennacl/scalar.hpp"
 +#include "viennacl/vector.hpp"
 +#include "viennacl/coordinate_matrix.hpp"
 +#include "viennacl/compressed_matrix.hpp"
 +#include "viennacl/ell_matrix.hpp"
 +#include "viennacl/hyb_matrix.hpp"
 +#include "viennacl/linalg/ilu.hpp"
 +#include "viennacl/linalg/jacobi_precond.hpp"
 +#include "viennacl/linalg/row_scaling.hpp"
 +#include "viennacl/linalg/cg.hpp"
 +#include "viennacl/linalg/bicgstab.hpp"
 +#include "viennacl/linalg/gmres.hpp"
 +#include "viennacl/io/matrix_market.hpp"
 +
 +
 +#include <iostream>
 +#include <vector>
 +#include "benchmark-utils.hpp"
 +#include "io.hpp"
 +
 +
 +using namespace boost::numeric;
 +
 +/*
 +*   Benchmark:
 +*   Iterative solver tests
 +*   
 +*/
 +
 +#define BENCHMARK_RUNS          1
 +
 +
 +template <typename ScalarType>
 +ScalarType diff_inf(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
 +{
 +   ublas::vector<ScalarType> v2_cpu(v2.size());
 +   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
 +
 +   for (unsigned int i=0; i<v1.size(); ++i)
 +   {
 +      if ( std::max( fabs(v2_cpu[i]), fabs(v1[i]) ) > 0 )
 +         v2_cpu[i] = fabs(v2_cpu[i] - v1[i]) / std::max( fabs(v2_cpu[i]), fabs(v1[i]) );
 +      else
 +         v2_cpu[i] = 0.0;
 +   }
 +
 +   return norm_inf(v2_cpu);
 +}
 +
 +template <typename ScalarType>
 +ScalarType diff_2(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
 +{
 +   ublas::vector<ScalarType> v2_cpu(v2.size());
 +   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
 +
 +   return norm_2(v1 - v2_cpu) / norm_2(v1);
 +}
 +
 +
 +template <typename MatrixType, typename VectorType, typename SolverTag, typename PrecondTag>
 +void run_solver(MatrixType const & matrix, VectorType const & rhs, VectorType const & ref_result, SolverTag const & solver, PrecondTag const & precond, long ops)
 +{
 +  Timer timer;
 +  VectorType result(rhs);
 +  VectorType residual(rhs);
 +  viennacl::ocl::get_queue().finish();
 +  
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    result = viennacl::linalg::solve(matrix, rhs, solver, precond);
 +  }
 +  viennacl::ocl::get_queue().finish();
 +  double exec_time = timer.get();
 +  std::cout << "Exec. time: " << exec_time << std::endl;
 +  std::cout << "Est. "; printOps(ops, exec_time / BENCHMARK_RUNS);
 +  residual -= viennacl::linalg::prod(matrix, result);
 +  std::cout << "Relative residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(rhs) << std::endl;
 +  std::cout << "Estimated rel. residual: " << solver.error() << std::endl;
 +  std::cout << "Iterations: " << solver.iters() << std::endl;
 +  result -= ref_result;
 +  std::cout << "Relative deviation from result: " << viennacl::linalg::norm_2(result) / viennacl::linalg::norm_2(ref_result) << std::endl;
 +}
 +
 +
 +template<typename ScalarType>
 +int run_benchmark()
 +{
 +  
 +  Timer timer;
 +  double exec_time;
 +   
 +  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
 +  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
 +  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
 +  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
 +  
 +  ublas::vector<ScalarType> ublas_vec1;
 +  ublas::vector<ScalarType> ublas_vec2;
 +  ublas::vector<ScalarType> ublas_result;
 +  unsigned int solver_iters = 20;
 +  unsigned int solver_krylov_dim = 20;
 +  double solver_tolerance = 1e-6;
 +
 +  #ifdef _MSC_VER
 +  if (!readVectorFromFile<ScalarType>("../../examples/testdata/rhs65025.txt", ublas_vec1))
 +  #else
 +  if (!readVectorFromFile<ScalarType>("../examples/testdata/rhs65025.txt", ublas_vec1))
 +  #endif
 +  {
 +    std::cout << "Error reading RHS file" << std::endl;
 +    return 0;
 +  }
 +  std::cout << "done reading rhs" << std::endl;
 +  ublas_vec2 = ublas_vec1;
 +  #ifdef _MSC_VER
 +  if (!readVectorFromFile<ScalarType>("../../examples/testdata/result65025.txt", ublas_result))
 +  #else
 +  if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_result))
 +  #endif
 +  {
 +    std::cout << "Error reading result file" << std::endl;
 +    return 0;
 +  }
 +  std::cout << "done reading result" << std::endl;
 +  
 +  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(ublas_vec1.size(), ublas_vec1.size());
 +  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix(ublas_vec1.size(), ublas_vec1.size());
 +  viennacl::ell_matrix<ScalarType> vcl_ell_matrix(ublas_vec1.size(), ublas_vec1.size());
 +  viennacl::hyb_matrix<ScalarType> vcl_hyb_matrix(ublas_vec1.size(), ublas_vec1.size());
 +
 +  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
 +  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size()); 
 +  viennacl::vector<ScalarType> vcl_result(ublas_vec1.size()); 
 +  
 +
 +  ublas::compressed_matrix<ScalarType> ublas_matrix;
 +  #ifdef _MSC_VER
 +  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
 +  #else
 +  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
 +  #endif
 +  {
 +    std::cout << "Error reading Matrix file" << std::endl;
 +    return EXIT_FAILURE;
 +  }
 +  //unsigned int cg_mat_size = cg_mat.size(); 
 +  std::cout << "done reading matrix" << std::endl;
 +  
 +  //cpu to gpu:
 +  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
 +  viennacl::copy(ublas_matrix, vcl_coordinate_matrix);
 +  viennacl::copy(ublas_matrix, vcl_ell_matrix);
 +  viennacl::copy(ublas_matrix, vcl_hyb_matrix);
 +  viennacl::copy(ublas_vec1, vcl_vec1);
 +  viennacl::copy(ublas_vec2, vcl_vec2);
 +  viennacl::copy(ublas_result, vcl_result);
 +  
 +  
 +  viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
 +  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
 +  
 +  viennacl::linalg::row_scaling< ublas::compressed_matrix<ScalarType> >    ublas_row_scaling(ublas_matrix, viennacl::linalg::row_scaling_tag(1));
 +  viennacl::linalg::row_scaling< viennacl::compressed_matrix<ScalarType> > vcl_row_scaling(vcl_compressed_matrix, viennacl::linalg::row_scaling_tag(1));
 +  
 +  ///////////////////////////////////////////////////////////////////////////////
 +  //////////////////////           ILU preconditioner         //////////////////
 +  ///////////////////////////////////////////////////////////////////////////////
 +  std::cout << "------- ILU0 on CPU (ublas) ----------" << std::endl;
 +
 +  timer.start();
 +  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
 +  exec_time = timer.get();
 +  std::cout << "Setup time: " << exec_time << std::endl;
 +  
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    ublas_ilu0.apply(ublas_vec1);
 +  }
 +  exec_time = timer.get();
 +  std::cout << "ublas time: " << exec_time << std::endl;
 +  
 +  std::cout << "------- ILU0 with ViennaCL ----------" << std::endl;
 +
 +  timer.start();
 +  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
 +  exec_time = timer.get();
 +  std::cout << "Setup time: " << exec_time << std::endl;
 +  
 +  viennacl::ocl::get_queue().finish();
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    vcl_ilu0.apply(vcl_vec1);
 +  }
 +  viennacl::ocl::get_queue().finish();
 +  exec_time = timer.get();
 +  std::cout << "ViennaCL time: " << exec_time << std::endl;
 +  
 +  
 +  std::cout << "------- ILUT on CPU (ublas) ----------" << std::endl;
 +
 +  timer.start();
 +  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
 +  exec_time = timer.get();
 +  std::cout << "Setup time: " << exec_time << std::endl;
 +  
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    ublas_ilut.apply(ublas_vec1);
 +  }
 +  exec_time = timer.get();
 +  std::cout << "ublas time: " << exec_time << std::endl;
 +
 +  std::cout << "------- ILUT with ViennaCL ----------" << std::endl;
 +
 +  timer.start();
 +  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
 +  exec_time = timer.get();
 +  std::cout << "Setup time: " << exec_time << std::endl;
 +  
 +  viennacl::ocl::get_queue().finish();
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    vcl_ilut.apply(vcl_vec1);
 +  }
 +  viennacl::ocl::get_queue().finish();
 +  exec_time = timer.get();
 +  std::cout << "ViennaCL time: " << exec_time << std::endl;
 +  
 +  ///////////////////////////////////////////////////////////////////////////////
 +  //////////////////////              CG solver                //////////////////
 +  ///////////////////////////////////////////////////////////////////////////////
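 +  // rough cost model: each CG iteration requires one sparse matrix-vector product (~nnz multiply-adds) plus about six vector operations per entry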
 +  long cg_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + 6 * ublas_vec2.size()));
 +  
 +  viennacl::linalg::cg_tag cg_solver(solver_tolerance, solver_iters);
 +  
 +  std::cout << "------- CG solver (no preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
 +  
 +  std::cout << "------- CG solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
 + 
 +  std::cout << "------- CG solver (no preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
 +  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
 +
 +  std::cout << "------- CG solver (no preconditioner) via ViennaCL, ell_matrix ----------" << std::endl;
 +  run_solver(vcl_ell_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
 +
 +  std::cout << "------- CG solver (no preconditioner) via ViennaCL, hyb_matrix ----------" << std::endl;
 +  run_solver(vcl_hyb_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
 +  
 +
 +  std::cout << "------- CG solver (ILU0 preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ilu0, cg_ops);
 +
 +  std::cout << "------- CG solver (ILU0 preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilu0, cg_ops);
 +  
 +  std::cout << "------- CG solver (ILUT preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ilut, cg_ops);
 +  
 +  std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
 +  
 +//  std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
 +//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
 +  
 +  
 +  std::cout << "------- CG solver (Jacobi preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_jacobi, cg_ops);
 +  
 +  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi, cg_ops);
 +  
 +//  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
 +//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi, cg_ops);
 +  
 +  
 +  std::cout << "------- CG solver (row scaling preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_row_scaling, cg_ops);
 +  
 +  std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling, cg_ops);
 +  
 +//  std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
 +//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling, cg_ops);
 +  
 +  ///////////////////////////////////////////////////////////////////////////////
 +  //////////////////////           BiCGStab solver             //////////////////
 +  ///////////////////////////////////////////////////////////////////////////////
 +  
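 +  // rough cost model: each BiCGStab iteration requires two sparse matrix-vector products (~2*nnz) plus about 13 vector operations per entry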
 +  long bicgstab_ops = static_cast<long>(solver_iters * (2 * ublas_matrix.nnz() + 13 * ublas_vec2.size()));
 +  
 +  viennacl::linalg::bicgstab_tag bicgstab_solver(solver_tolerance, solver_iters);
 +
 +  std::cout << "------- BiCGStab solver (no preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
 +  
 +  std::cout << "------- BiCGStab solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
 +  
 +//  std::cout << "------- BiCGStab solver (no preconditioner) on GPU, coordinate_matrix ----------" << std::endl;
 +//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, bicgstab_ops);
 +
 +  
 +  std::cout << "------- BiCGStab solver (ILUT preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_ilut, bicgstab_ops);
 +  
 +  std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
 +  
 +//  std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
 +//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
 +  
 +  std::cout << "------- BiCGStab solver (Jacobi preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_jacobi, bicgstab_ops);
 +  
 +  std::cout << "------- BiCGStab solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi, bicgstab_ops);
 +  
 +//  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
 +//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi, bicgstab_ops);
 +  
 +  std::cout << "------- BiCGStab solver (row scaling preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_row_scaling, bicgstab_ops);
 +  
 +  std::cout << "------- BiCGStab solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling, bicgstab_ops);
 +  
 +//  std::cout << "------- CG solver row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
 +//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling, bicgstab_ops);
 +
 +  ///////////////////////////////////////////////////////////////////////////////
 +  ///////////////////////            GMRES solver             ///////////////////
 +  ///////////////////////////////////////////////////////////////////////////////
 +  
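 +  // rough cost model: one sparse matrix-vector product per iteration plus orthogonalization against the growing Krylov basis (~(2*iters+7) vector operations per entry)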
 +  long gmres_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + (solver_iters * 2 + 7) * ublas_vec2.size()));
 +  
 +  viennacl::linalg::gmres_tag gmres_solver(solver_tolerance, solver_iters, solver_krylov_dim);
 +  
 +  std::cout << "------- GMRES solver (no preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
 +  
 +  std::cout << "------- GMRES solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
 +  
 +//  std::cout << "------- GMRES solver (no preconditioner) on GPU, coordinate_matrix ----------" << std::endl;
 +//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, bicgstab_ops);
 +
 +  
 +  std::cout << "------- GMRES solver (ILUT preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_ilut, gmres_ops);
 +  
 +  std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
 +  
 +//  std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
 +//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
 +
 +
 +  std::cout << "------- GMRES solver (Jacobi preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_jacobi, gmres_ops);
 +  
 +  std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi, gmres_ops);
 +  
 +//  std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
 +//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi, gmres_ops);
 +  
 +  
 +  std::cout << "------- GMRES solver (row scaling preconditioner) using ublas ----------" << std::endl;
 +  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_row_scaling, gmres_ops);
 +  
 +  std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
 +  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling, gmres_ops);
 +  
 +//  std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
 +//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling, gmres_ops);
 +  
 +  return 0;
 +}
 +
 +int main()
 +{
 +  std::cout << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << "               Device Info" << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  
 +  std::cout << viennacl::ocl::current_device().info() << std::endl;
 +  
 +  std::cout << "---------------------------------------------------------------------------" << std::endl;
 +  std::cout << "---------------------------------------------------------------------------" << std::endl;
 +  std::cout << " Benchmark for Execution Times of Iterative Solvers provided with ViennaCL " << std::endl;
 +  std::cout << "---------------------------------------------------------------------------" << std::endl;
 +  std::cout << " Note that the purpose of this benchmark is not to run solvers until" << std::endl;
 +  std::cout << " convergence. Instead, only the execution times of a few iterations are" << std::endl;
 +  std::cout << " recorded. Residual errors are only printed for information." << std::endl << std::endl;
 +   
 +
 +  std::cout << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << "## Benchmark :: Solver" << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << std::endl;
 +  std::cout << " ATTENTION: Please be aware that GMRES may not work on ATI GPUs with Stream SDK v2.1." << std::endl;
 +  std::cout << "   -------------------------------" << std::endl;
 +  std::cout << "   # benchmarking single-precision" << std::endl;
 +  std::cout << "   -------------------------------" << std::endl;
 +  run_benchmark<float>();
 +  if( viennacl::ocl::current_device().double_support() )
 +  {
 +    std::cout << std::endl;
 +    std::cout << "   -------------------------------" << std::endl;
 +    std::cout << "   # benchmarking double-precision" << std::endl;
 +    std::cout << "   -------------------------------" << std::endl;
 +    run_benchmark<double>();
 +  }
 +  return 0;
 +}
 +
++=======
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /*
+ *
+ *   Benchmark:  Iterative solver tests (solver.cpp and solver.cu are identical, the latter being required for compilation using CUDA nvcc)
+ *
+ */
+ 
+ 
+ #ifndef NDEBUG
+  #define NDEBUG
+ #endif
+ 
+ #include <boost/numeric/ublas/matrix_sparse.hpp>
+ #include <boost/numeric/ublas/io.hpp>
+ #include <boost/numeric/ublas/operation_sparse.hpp>
+ 
+ #define VIENNACL_WITH_UBLAS 1
+ 
+ #include "viennacl/scalar.hpp"
+ #include "viennacl/vector.hpp"
+ #include "viennacl/coordinate_matrix.hpp"
+ #include "viennacl/compressed_matrix.hpp"
+ #include "viennacl/ell_matrix.hpp"
+ #include "viennacl/hyb_matrix.hpp"
+ #include "viennacl/context.hpp"
+ 
+ #include "viennacl/linalg/cg.hpp"
+ #include "viennacl/linalg/bicgstab.hpp"
+ #include "viennacl/linalg/gmres.hpp"
+ 
+ #include "viennacl/linalg/ilu.hpp"
+ #include "viennacl/linalg/ichol.hpp"
+ #include "viennacl/linalg/jacobi_precond.hpp"
+ #include "viennacl/linalg/row_scaling.hpp"
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+   #include "viennacl/linalg/mixed_precision_cg.hpp"
+ #endif
+ 
+ #include "viennacl/io/matrix_market.hpp"
+ 
+ 
+ #include <iostream>
+ #include <vector>
+ #include "benchmark-utils.hpp"
+ #include "io.hpp"
+ 
+ 
+ using namespace boost::numeric;
+ 
+ #define BENCHMARK_RUNS          1
+ 
+ 
+ template <typename ScalarType>
+ ScalarType diff_inf(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
+ {
+    ublas::vector<ScalarType> v2_cpu(v2.size());
+    viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+ 
+    for (unsigned int i=0; i<v1.size(); ++i)
+    {
+       if ( std::max( fabs(v2_cpu[i]), fabs(v1[i]) ) > 0 )
+          v2_cpu[i] = fabs(v2_cpu[i] - v1[i]) / std::max( fabs(v2_cpu[i]), fabs(v1[i]) );
+       else
+          v2_cpu[i] = 0.0;
+    }
+ 
+    return norm_inf(v2_cpu);
+ }
+ 
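+ // relative Euclidean distance ||v1 - v2||_2 / ||v1||_2 between a host vector v1 and a device vector v2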
+ template <typename ScalarType>
+ ScalarType diff_2(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
+ {
+    ublas::vector<ScalarType> v2_cpu(v2.size());
+    viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+ 
+    return norm_2(v1 - v2_cpu) / norm_2(v1);
+ }
+ 
+ 
+ template <typename MatrixType, typename VectorType, typename SolverTag, typename PrecondTag>
+ void run_solver(MatrixType const & matrix, VectorType const & rhs, VectorType const & ref_result, SolverTag const & solver, PrecondTag const & precond, long ops)
+ {
+   Timer timer;
+   VectorType result(rhs);
+   VectorType residual(rhs);
+   viennacl::backend::finish();
+ 
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+   {
+     result = viennacl::linalg::solve(matrix, rhs, solver, precond);
+   }
+   viennacl::backend::finish();
+   double exec_time = timer.get();
+   std::cout << "Exec. time: " << exec_time << std::endl;
+   std::cout << "Est. "; printOps(static_cast<double>(ops), exec_time / BENCHMARK_RUNS);
+   residual -= viennacl::linalg::prod(matrix, result);
+   std::cout << "Relative residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(rhs) << std::endl;
+   std::cout << "Estimated rel. residual: " << solver.error() << std::endl;
+   std::cout << "Iterations: " << solver.iters() << std::endl;
+   result -= ref_result;
+   std::cout << "Relative deviation from result: " << viennacl::linalg::norm_2(result) / viennacl::linalg::norm_2(ref_result) << std::endl;
+ }
+ 
+ 
+ template<typename ScalarType>
+ int run_benchmark(viennacl::context ctx)
+ {
+   Timer timer;
+   double exec_time;
+ 
+   ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
+   ScalarType std_factor2 = static_cast<ScalarType>(42.0);
+   viennacl::scalar<ScalarType> vcl_factor1(std_factor1, ctx);
+   viennacl::scalar<ScalarType> vcl_factor2(std_factor2, ctx);
+ 
+   ublas::vector<ScalarType> ublas_vec1;
+   ublas::vector<ScalarType> ublas_vec2;
+   ublas::vector<ScalarType> ublas_result;
+   unsigned int solver_iters = 100;
+   unsigned int solver_krylov_dim = 20;
+   double solver_tolerance = 1e-6;
+ 
+   if (!readVectorFromFile<ScalarType>("../examples/testdata/rhs65025.txt", ublas_vec1))
+   {
+     std::cout << "Error reading RHS file" << std::endl;
+     return 0;
+   }
+   std::cout << "done reading rhs" << std::endl;
+   ublas_vec2 = ublas_vec1;
+   if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_result))
+   {
+     std::cout << "Error reading result file" << std::endl;
+     return 0;
+   }
+   std::cout << "done reading result" << std::endl;
+ 
+   viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(ublas_vec1.size(), ublas_vec1.size(), ctx);
+   viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix(ublas_vec1.size(), ublas_vec1.size(), ctx);
+   viennacl::ell_matrix<ScalarType> vcl_ell_matrix(ctx);
+   viennacl::hyb_matrix<ScalarType> vcl_hyb_matrix(ctx);
+ 
+   viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size(), ctx);
+   viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size(), ctx);
+   viennacl::vector<ScalarType> vcl_result(ublas_vec1.size(), ctx);
+ 
+ 
+   ublas::compressed_matrix<ScalarType> ublas_matrix;
+   if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+   {
+     std::cout << "Error reading Matrix file" << std::endl;
+     return EXIT_FAILURE;
+   }
+   //unsigned int cg_mat_size = cg_mat.size();
+   std::cout << "done reading matrix" << std::endl;
+ 
+   //cpu to gpu:
+   viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+   viennacl::copy(ublas_matrix, vcl_coordinate_matrix);
+   viennacl::copy(ublas_matrix, vcl_ell_matrix);
+   viennacl::copy(ublas_matrix, vcl_hyb_matrix);
+   viennacl::copy(ublas_vec1, vcl_vec1);
+   viennacl::copy(ublas_vec2, vcl_vec2);
+   viennacl::copy(ublas_result, vcl_result);
+ 
+ 
+   std::cout << "------- Jacobi preconditioner ----------" << std::endl;
+   viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
+   viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi_csr(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
+   viennacl::linalg::jacobi_precond< viennacl::coordinate_matrix<ScalarType> > vcl_jacobi_coo(vcl_coordinate_matrix, viennacl::linalg::jacobi_tag());
+ 
+   std::cout << "------- Row-Scaling preconditioner ----------" << std::endl;
+   viennacl::linalg::row_scaling< ublas::compressed_matrix<ScalarType> >    ublas_row_scaling(ublas_matrix, viennacl::linalg::row_scaling_tag(1));
+   viennacl::linalg::row_scaling< viennacl::compressed_matrix<ScalarType> > vcl_row_scaling_csr(vcl_compressed_matrix, viennacl::linalg::row_scaling_tag(1));
+   viennacl::linalg::row_scaling< viennacl::coordinate_matrix<ScalarType> > vcl_row_scaling_coo(vcl_coordinate_matrix, viennacl::linalg::row_scaling_tag(1));
+ 
+   ///////////////////////////////////////////////////////////////////////////////
+   //////////////////////  Incomplete Cholesky preconditioner   //////////////////
+   ///////////////////////////////////////////////////////////////////////////////
+   std::cout << "------- ICHOL0 on CPU (ublas) ----------" << std::endl;
+ 
+   timer.start();
+   viennacl::linalg::ichol0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ichol0(ublas_matrix, viennacl::linalg::ichol0_tag());
+   exec_time = timer.get();
+   std::cout << "Setup time: " << exec_time << std::endl;
+ 
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     ublas_ichol0.apply(ublas_vec1);
+   exec_time = timer.get();
+   std::cout << "ublas time: " << exec_time << std::endl;
+ 
+   std::cout << "------- ICHOL0 with ViennaCL ----------" << std::endl;
+ 
+   timer.start();
+   viennacl::linalg::ichol0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ichol0(vcl_compressed_matrix, viennacl::linalg::ichol0_tag());
+   exec_time = timer.get();
+   std::cout << "Setup time: " << exec_time << std::endl;
+ 
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     vcl_ichol0.apply(vcl_vec1);
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "ViennaCL time: " << exec_time << std::endl;
+ 
+ 
+   ///////////////////////////////////////////////////////////////////////////////
+   //////////////////////           ILU preconditioner         //////////////////
+   ///////////////////////////////////////////////////////////////////////////////
+   std::cout << "------- ILU0 on with ublas ----------" << std::endl;
+ 
+   timer.start();
+   viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+   exec_time = timer.get();
+   std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     ublas_ilu0.apply(ublas_vec1);
+   exec_time = timer.get();
+   std::cout << "ublas ILU0 substitution time (no level scheduling): " << exec_time << std::endl;
+ 
+ 
+   std::cout << "------- ILU0 with ViennaCL ----------" << std::endl;
+ 
+   timer.start();
+   viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+   exec_time = timer.get();
+   std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+ 
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     vcl_ilu0.apply(vcl_vec1);
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "ViennaCL ILU0 substitution time (no level scheduling): " << exec_time << std::endl;
+ 
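+   // level scheduling groups the rows of the triangular factors into independent levels that can be processed in parallel on the device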
+   timer.start();
+   viennacl::linalg::ilu0_tag ilu0_with_level_scheduling;
+   ilu0_with_level_scheduling.use_level_scheduling(true);
+   viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0_level_scheduling(vcl_compressed_matrix, ilu0_with_level_scheduling);
+   exec_time = timer.get();
+   std::cout << "Setup time (with level scheduling): " << exec_time << std::endl;
+ 
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     vcl_ilu0_level_scheduling.apply(vcl_vec1);
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "ViennaCL ILU0 substitution time (with level scheduling): " << exec_time << std::endl;
+ 
+ 
+ 
+   ////////////////////////////////////////////
+ 
+   std::cout << "------- Block-ILU0 with ublas ----------" << std::endl;
+ 
+   ublas_vec1 = ublas_vec2;
+   viennacl::copy(ublas_vec1, vcl_vec1);
+ 
+   timer.start();
+   viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
+                                        viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+   exec_time = timer.get();
+   std::cout << "Setup time: " << exec_time << std::endl;
+ 
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     ublas_block_ilu0.apply(ublas_vec1);
+   exec_time = timer.get();
+   std::cout << "ublas time: " << exec_time << std::endl;
+ 
+   std::cout << "------- Block-ILU0 with ViennaCL ----------" << std::endl;
+ 
+   timer.start();
+   viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<ScalarType>,
+                                        viennacl::linalg::ilu0_tag>          vcl_block_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+   exec_time = timer.get();
+   std::cout << "Setup time: " << exec_time << std::endl;
+ 
+   //vcl_block_ilu0.apply(vcl_vec1);  //warm-up
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     vcl_block_ilu0.apply(vcl_vec1);
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "ViennaCL time: " << exec_time << std::endl;
+ 
+   ////////////////////////////////////////////
+ 
+   std::cout << "------- ILUT with ublas ----------" << std::endl;
+ 
+   ublas_vec1 = ublas_vec2;
+   viennacl::copy(ublas_vec1, vcl_vec1);
+ 
+   timer.start();
+   viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
+   exec_time = timer.get();
+   std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     ublas_ilut.apply(ublas_vec1);
+   exec_time = timer.get();
+   std::cout << "ublas ILUT substitution time (no level scheduling): " << exec_time << std::endl;
+ 
+ 
+   std::cout << "------- ILUT with ViennaCL ----------" << std::endl;
+ 
+   timer.start();
+   viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
+   exec_time = timer.get();
+   std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+ 
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     vcl_ilut.apply(vcl_vec1);
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "ViennaCL ILUT substitution time (no level scheduling): " << exec_time << std::endl;
+ 
+   timer.start();
+   viennacl::linalg::ilut_tag ilut_with_level_scheduling;
+   ilut_with_level_scheduling.use_level_scheduling(true);
+   viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut_level_scheduling(vcl_compressed_matrix, ilut_with_level_scheduling);
+   exec_time = timer.get();
+   std::cout << "Setup time (with level scheduling): " << exec_time << std::endl;
+ 
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     vcl_ilut_level_scheduling.apply(vcl_vec1);
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "ViennaCL ILUT substitution time (with level scheduling): " << exec_time << std::endl;
+ 
+ 
+   ////////////////////////////////////////////
+ 
+   std::cout << "------- Block-ILUT with ublas ----------" << std::endl;
+ 
+   ublas_vec1 = ublas_vec2;
+   viennacl::copy(ublas_vec1, vcl_vec1);
+ 
+   timer.start();
+   viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
+                                        viennacl::linalg::ilut_tag>          ublas_block_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
+   exec_time = timer.get();
+   std::cout << "Setup time: " << exec_time << std::endl;
+ 
+   //ublas_block_ilut.apply(ublas_vec1);
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     ublas_block_ilut.apply(ublas_vec1);
+   exec_time = timer.get();
+   std::cout << "ublas time: " << exec_time << std::endl;
+ 
+   std::cout << "------- Block-ILUT with ViennaCL ----------" << std::endl;
+ 
+   timer.start();
+   viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<ScalarType>,
+                                        viennacl::linalg::ilut_tag>          vcl_block_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
+   exec_time = timer.get();
+   std::cout << "Setup time: " << exec_time << std::endl;
+ 
+   //vcl_block_ilut.apply(vcl_vec1);  //warm-up
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+     vcl_block_ilut.apply(vcl_vec1);
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "ViennaCL time: " << exec_time << std::endl;
+ 
+ 
+   ///////////////////////////////////////////////////////////////////////////////
+   //////////////////////              CG solver                //////////////////
+   ///////////////////////////////////////////////////////////////////////////////
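+   // Rough operation count: one sparse matrix-vector product (~nnz multiply-add pairs) plus about six vector-sized updates per CG iteration.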
+   long cg_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + 6 * ublas_vec2.size()));
+ 
+   viennacl::linalg::cg_tag cg_solver(solver_tolerance, solver_iters);
+ 
+   std::cout << "------- CG solver (no preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+ 
+   std::cout << "------- CG solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+   if (sizeof(ScalarType) == sizeof(double))
+   {
+     std::cout << "------- CG solver, mixed precision (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+     viennacl::linalg::mixed_precision_cg_tag mixed_precision_cg_solver(solver_tolerance, solver_iters);
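+     // Mixed precision performs the bulk of the arithmetic in single precision and applies double-precision corrections, hence the sizeof(ScalarType) guard above.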
+ 
+     run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, mixed_precision_cg_solver, viennacl::linalg::no_precond(), cg_ops); // run twice; the first run likely includes one-time kernel compilation overhead
+     run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, mixed_precision_cg_solver, viennacl::linalg::no_precond(), cg_ops);
+   }
+ #endif
+ 
+   std::cout << "------- CG solver (no preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+   run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+ 
+   std::cout << "------- CG solver (no preconditioner) via ViennaCL, ell_matrix ----------" << std::endl;
+   run_solver(vcl_ell_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+ 
+   std::cout << "------- CG solver (no preconditioner) via ViennaCL, hyb_matrix ----------" << std::endl;
+   run_solver(vcl_hyb_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+ 
+   std::cout << "------- CG solver (ICHOL0 preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ichol0, cg_ops);
+ 
+   std::cout << "------- CG solver (ICHOL0 preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ichol0, cg_ops);
+ 
+ 
+   std::cout << "------- CG solver (ILU0 preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ilu0, cg_ops);
+ 
+   std::cout << "------- CG solver (ILU0 preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilu0, cg_ops);
+ 
+ 
+   std::cout << "------- CG solver (Block-ILU0 preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_block_ilu0, cg_ops);
+ 
+   std::cout << "------- CG solver (Block-ILU0 preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_block_ilu0, cg_ops);
+ 
+   std::cout << "------- CG solver (ILUT preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ilut, cg_ops);
+ 
+   std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
+ 
+   std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+   run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
+ 
+   std::cout << "------- CG solver (Block-ILUT preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_block_ilut, cg_ops);
+ 
+   std::cout << "------- CG solver (Block-ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_block_ilut, cg_ops);
+ 
+   std::cout << "------- CG solver (Jacobi preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_jacobi, cg_ops);
+ 
+   std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi_csr, cg_ops);
+ 
+   std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+   run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi_coo, cg_ops);
+ 
+ 
+   std::cout << "------- CG solver (row scaling preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_row_scaling, cg_ops);
+ 
+   std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling_csr, cg_ops);
+ 
+   std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+   run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling_coo, cg_ops);
+ 
+ 
+   ///////////////////////////////////////////////////////////////////////////////
+   //////////////////////           BiCGStab solver             //////////////////
+   ///////////////////////////////////////////////////////////////////////////////
+ 
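+   // Rough operation count: two sparse matrix-vector products plus about 13 vector-sized updates per BiCGStab iteration.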
+   long bicgstab_ops = static_cast<long>(solver_iters * (2 * ublas_matrix.nnz() + 13 * ublas_vec2.size()));
+ 
+   viennacl::linalg::bicgstab_tag bicgstab_solver(solver_tolerance, solver_iters);
+ 
+   std::cout << "------- BiCGStab solver (no preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
+ 
+   std::cout << "------- BiCGStab solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
+ 
+   std::cout << "------- BiCGStab solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
+ 
+ 
+   std::cout << "------- BiCGStab solver (ILUT preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_ilut, bicgstab_ops);
+ 
+   std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
+ 
+   std::cout << "------- BiCGStab solver (Block-ILUT preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_block_ilut, bicgstab_ops);
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+   std::cout << "------- BiCGStab solver (Block-ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_block_ilut, bicgstab_ops);
+ #endif
+ 
+ //  std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+ //  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
+ 
+   std::cout << "------- BiCGStab solver (Jacobi preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_jacobi, bicgstab_ops);
+ 
+   std::cout << "------- BiCGStab solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi_csr, bicgstab_ops);
+ 
+   std::cout << "------- BiCGStab solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+   run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi_coo, bicgstab_ops);
+ 
+ 
+   std::cout << "------- BiCGStab solver (row scaling preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_row_scaling, bicgstab_ops);
+ 
+   std::cout << "------- BiCGStab solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling_csr, bicgstab_ops);
+ 
+   std::cout << "------- BiCGStab solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+   run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling_coo, bicgstab_ops);
+ 
+ 
+   ///////////////////////////////////////////////////////////////////////////////
+   ///////////////////////            GMRES solver             ///////////////////
+   ///////////////////////////////////////////////////////////////////////////////
+ 
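+   // Rough operation count: one sparse matrix-vector product plus O(Krylov dimension) vector updates per GMRES iteration, with solver_iters used as a stand-in for the Krylov dimension.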
+   long gmres_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + (solver_iters * 2 + 7) * ublas_vec2.size()));
+ 
+   viennacl::linalg::gmres_tag gmres_solver(solver_tolerance, solver_iters, solver_krylov_dim);
+ 
+   std::cout << "------- GMRES solver (no preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
+ 
+   std::cout << "------- GMRES solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
+ 
+   std::cout << "------- GMRES solver (no preconditioner) on GPU, coordinate_matrix ----------" << std::endl;
+   run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
+ 
+ 
+   std::cout << "------- GMRES solver (ILUT preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_ilut, gmres_ops);
+ 
+   std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
+ 
+   std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+   run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
+ 
+ 
+   std::cout << "------- GMRES solver (Jacobi preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_jacobi, gmres_ops);
+ 
+   std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi_csr, gmres_ops);
+ 
+   std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+   run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi_coo, gmres_ops);
+ 
+ 
+   std::cout << "------- GMRES solver (row scaling preconditioner) using ublas ----------" << std::endl;
+   run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_row_scaling, gmres_ops);
+ 
+   std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+   run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling_csr, gmres_ops);
+ 
+   std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+   run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling_coo, gmres_ops);
+ 
+   return EXIT_SUCCESS;
+ }
+ 
+ int main()
+ {
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "               Device Info" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+   viennacl::ocl::platform pf = viennacl::ocl::get_platforms()[0];
+   std::vector<viennacl::ocl::device> const & devices = pf.devices();
+ 
+   // Set first device to first context:
+   viennacl::ocl::setup_context(0, devices[0]);
+ 
+   // Set second device for second context (use the same device for the second context if only one device available):
+   if (devices.size() > 1)
+     viennacl::ocl::setup_context(1, devices[1]);
+   else
+     viennacl::ocl::setup_context(1, devices[0]);
+ 
+   std::cout << viennacl::ocl::current_device().info() << std::endl;
+   viennacl::context ctx(viennacl::ocl::get_context(1));
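+   // The benchmark data lives in context 1, so on a machine with two devices the solvers run on the second one.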
+ #else
+   viennacl::context ctx;
+ #endif
+ 
+   std::cout << "---------------------------------------------------------------------------" << std::endl;
+   std::cout << "---------------------------------------------------------------------------" << std::endl;
+   std::cout << " Benchmark for Execution Times of Iterative Solvers provided with ViennaCL " << std::endl;
+   std::cout << "---------------------------------------------------------------------------" << std::endl;
+   std::cout << " Note that the purpose of this benchmark is not to run solvers until" << std::endl;
+   std::cout << " convergence. Instead, only the execution times of a few iterations are" << std::endl;
+   std::cout << " recorded. Residual errors are only printed for information." << std::endl << std::endl;
+ 
+ 
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Benchmark :: Solver" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   std::cout << "   -------------------------------" << std::endl;
+   std::cout << "   # benchmarking single-precision" << std::endl;
+   std::cout << "   -------------------------------" << std::endl;
+   run_benchmark<float>(ctx);
+ #ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+ #endif
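+   // Without OpenCL the double_support() check compiles away, so the block below always runs.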
+   {
+     std::cout << std::endl;
+     std::cout << "   -------------------------------" << std::endl;
+     std::cout << "   # benchmarking double-precision" << std::endl;
+     std::cout << "   -------------------------------" << std::endl;
+     run_benchmark<double>(ctx);
+   }
+   return 0;
+ }
+ 
++>>>>>>> upstream/1.5.1
diff --cc examples/benchmarks/sparse.cpp
index 5f9aed7,ce03884..c458069
--- a/examples/benchmarks/sparse.cpp
+++ b/examples/benchmarks/sparse.cpp
@@@ -1,303 -1,322 +1,628 @@@
++<<<<<<< HEAD
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +//#define VIENNACL_BUILD_INFO
 +#ifndef NDEBUG
 + #define NDEBUG
 +#endif
 +
 +#define VIENNACL_HAVE_UBLAS 1
 +
 +#include <boost/numeric/ublas/matrix_sparse.hpp>
 +#include <boost/numeric/ublas/operation_sparse.hpp>
 +
 +
 +#include "viennacl/scalar.hpp"
 +#include "viennacl/vector.hpp"
 +#include "viennacl/coordinate_matrix.hpp"
 +#include "viennacl/compressed_matrix.hpp"
 +#include "viennacl/ell_matrix.hpp"
 +#include "viennacl/hyb_matrix.hpp"
 +#include "viennacl/linalg/prod.hpp"
 +#include "viennacl/linalg/norm_2.hpp"
 +#include "viennacl/io/matrix_market.hpp"
 +
 +
 +#include <iostream>
 +#include <vector>
 +#include "benchmark-utils.hpp"
 +#include "io.hpp"
 +
 +
 +/*
 +*   Benchmark:
 +*   Sparse matrix-vector products
 +*
 +*/
 +
 +#define BENCHMARK_RUNS          10
 +
 +
 +template<typename ScalarType>
 +int run_benchmark()
 +{
 +  Timer timer;
 +  double exec_time;
 +
 +  ScalarType std_result = 0;
 +
 +  ScalarType std_factor1 = ScalarType(3.1415);
 +  ScalarType std_factor2 = ScalarType(42.0);
 +  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
 +  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
 +  
 +  boost::numeric::ublas::vector<ScalarType> ublas_vec1;
 +  boost::numeric::ublas::vector<ScalarType> ublas_vec2;
 +
 +  #ifdef _MSC_VER
 +  if (!readVectorFromFile<ScalarType>("../../examples/testdata/result65025.txt", ublas_vec1))
 +  #else
 +  if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_vec1))
 +  #endif
 +  {
 +    std::cout << "Error reading RHS file" << std::endl;
 +    return 0;
 +  }
 +  std::cout << "done reading rhs" << std::endl;
 +  ublas_vec2 = ublas_vec1;
 +  
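 +  // The second template argument of compressed_matrix is the memory alignment (padding) used by the OpenCL kernels.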
 +  viennacl::compressed_matrix<ScalarType, 1> vcl_compressed_matrix_1;
 +  viennacl::compressed_matrix<ScalarType, 4> vcl_compressed_matrix_4;
 +  viennacl::compressed_matrix<ScalarType, 8> vcl_compressed_matrix_8;
 +  
 +  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix_128;
 +
 +  viennacl::ell_matrix<ScalarType, 1> vcl_ell_matrix_1;
 +  viennacl::hyb_matrix<ScalarType, 1> vcl_hyb_matrix_1;
 +
 +  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
 +  #ifdef _MSC_VER
 +  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
 +  #else
 +  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
 +  #endif
 +  {
 +    std::cout << "Error reading Matrix file" << std::endl;
 +    return 0;
 +  }
 +  //unsigned int cg_mat_size = cg_mat.size(); 
 +  std::cout << "done reading matrix" << std::endl;
 +  
 +  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
 +  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size()); 
 +  viennacl::vector<ScalarType> vcl_vec3(ublas_vec1.size()); 
 +  
 +  //cpu to gpu:
 +  viennacl::copy(ublas_matrix, vcl_compressed_matrix_1);
 +  #ifndef VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
 +  viennacl::copy(ublas_matrix, vcl_compressed_matrix_4);
 +  viennacl::copy(ublas_matrix, vcl_compressed_matrix_8);
 +  #endif
 +  viennacl::copy(ublas_matrix, vcl_coordinate_matrix_128);
 +  viennacl::copy(ublas_matrix, vcl_ell_matrix_1);
 +  viennacl::copy(ublas_matrix, vcl_hyb_matrix_1);
 +  viennacl::copy(ublas_vec1, vcl_vec1);
 +  viennacl::copy(ublas_vec2, vcl_vec2);
 +
 +  
 +  ///////////// Matrix operations /////////////////
 +  
 +  std::cout << "------- Matrix-Vector product on CPU ----------" << std::endl;
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    ublas_vec1 = prod(ublas_matrix, ublas_vec2);
 +  }
 +  exec_time = timer.get();
 +  std::cout << "CPU time: " << exec_time << std::endl;
 +  std::cout << "CPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
 +  std::cout << ublas_vec1[0] << std::endl;
 +  
 +  
 +  std::cout << "------- Matrix-Vector product with compressed_matrix ----------" << std::endl;
 +  
 +  
 +  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2); //startup calculation
 +  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2); //startup calculation
 +  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2); //startup calculation
 +  std_result = 0.0;
 +  
 +  viennacl::ocl::get_queue().finish();
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2);
 +  }
 +  viennacl::ocl::get_queue().finish();
 +  exec_time = timer.get();
 +  std::cout << "GPU time align1: " << exec_time << std::endl;
 +  std::cout << "GPU align1 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
 +  std::cout << vcl_vec1[0] << std::endl;
 +
 +  viennacl::ocl::get_queue().finish();
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2);
 +  }
 +  viennacl::ocl::get_queue().finish();
 +  exec_time = timer.get();
 +  std::cout << "GPU time align4: " << exec_time << std::endl;
 +  std::cout << "GPU align4 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
 +  std::cout << vcl_vec1[0] << std::endl;
 +
 +  viennacl::ocl::get_queue().finish();
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2);
 +  }
 +  viennacl::ocl::get_queue().finish();
 +  exec_time = timer.get();
 +  std::cout << "GPU time align8: " << exec_time << std::endl;
 +  std::cout << "GPU align8 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
 +  std::cout << vcl_vec1[0] << std::endl;
 +  
 +  
 +  std::cout << "------- Matrix-Vector product with coordinate_matrix ----------" << std::endl;
 +  vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2); //startup calculation
 +  viennacl::ocl::get_queue().finish();
 +  
 +  viennacl::copy(vcl_vec1, ublas_vec2);  
 +  long err_cnt = 0;
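 +  // Check the GPU result against the CPU reference with a 1% relative tolerance, stopping after a handful of mismatches.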
 +  for (size_t i=0; i<ublas_vec1.size(); ++i)
 +  {
 +    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
 +    {
 +      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
 +      ++err_cnt;
 +      if (err_cnt > 5)
 +        break;
 +    }
 +  }
 +  
 +  viennacl::ocl::get_queue().finish();
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2);
 +  }
 +  viennacl::ocl::get_queue().finish();
 +  exec_time = timer.get();
 +  std::cout << "GPU time: " << exec_time << std::endl;
 +  std::cout << "GPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
 +  std::cout << vcl_vec1[0] << std::endl;
 +
 +  
 +  std::cout << "------- Matrix-Vector product with ell_matrix ----------" << std::endl;
 +  vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2); //startup calculation
 +  viennacl::ocl::get_queue().finish();
 +  
 +  viennacl::copy(vcl_vec1, ublas_vec2);  
 +  err_cnt = 0;
 +  for (size_t i=0; i<ublas_vec1.size(); ++i)
 +  {
 +    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
 +    {
 +      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
 +      ++err_cnt;
 +      if (err_cnt > 5)
 +        break;
 +    }
 +  }
 +  
 +  viennacl::ocl::get_queue().finish();
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2);
 +  }
 +  viennacl::ocl::get_queue().finish();
 +  exec_time = timer.get();
 +  std::cout << "GPU time: " << exec_time << std::endl;
 +  std::cout << "GPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
 +  std::cout << vcl_vec1[0] << std::endl;
 +
 +  
 +  std::cout << "------- Matrix-Vector product with hyb_matrix ----------" << std::endl;
 +  vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2); //startup calculation
 +  viennacl::ocl::get_queue().finish();
 +  
 +  viennacl::copy(vcl_vec1, ublas_vec2);  
 +  err_cnt = 0;
 +  for (size_t i=0; i<ublas_vec1.size(); ++i)
 +  {
 +    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
 +    {
 +      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
 +      ++err_cnt;
 +      if (err_cnt > 5)
 +        break;
 +    }
 +  }
 +  
 +  viennacl::ocl::get_queue().finish();
 +  timer.start();
 +  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
 +  {
 +    vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2);
 +  }
 +  viennacl::ocl::get_queue().finish();
 +  exec_time = timer.get();
 +  std::cout << "GPU time: " << exec_time << std::endl;
 +  std::cout << "GPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
 +  std::cout << vcl_vec1[0] << std::endl;
 +  
 +  
 +  return EXIT_SUCCESS;
 +}
 +
 +
 +int main()
 +{
 +  std::cout << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << "               Device Info" << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  
 +  std::cout << viennacl::ocl::current_device().info() << std::endl;
 +  
 +  std::cout << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << "## Benchmark :: Sparse" << std::endl;
 +  std::cout << "----------------------------------------------" << std::endl;
 +  std::cout << std::endl;
 +  std::cout << "   -------------------------------" << std::endl;
 +  std::cout << "   # benchmarking single-precision" << std::endl;
 +  std::cout << "   -------------------------------" << std::endl;
 +  run_benchmark<float>();
 +  if( viennacl::ocl::current_device().double_support() )
 +  {
 +    std::cout << std::endl;
 +    std::cout << "   -------------------------------" << std::endl;
 +    std::cout << "   # benchmarking double-precision" << std::endl;
 +    std::cout << "   -------------------------------" << std::endl;
 +    run_benchmark<double>();
 +  }
 +  return 0;
 +}
 +
++=======
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ 
+ /*
+ *   Benchmark:  Sparse matrix operations, i.e. matrix-vector products. (sparse.cpp and sparse.cu are identical; the .cu file is required for compilation with CUDA's nvcc.)
+ *
+ */
+ 
+ //#define VIENNACL_BUILD_INFO
+ #ifndef NDEBUG
+  #define NDEBUG
+ #endif
+ 
+ #define VIENNACL_WITH_UBLAS 1
+ 
+ #include <boost/numeric/ublas/triangular.hpp>
+ #include <boost/numeric/ublas/vector.hpp>
+ #include <boost/numeric/ublas/vector_proxy.hpp>
+ #include <boost/numeric/ublas/matrix_sparse.hpp>
+ #include <boost/numeric/ublas/operation_sparse.hpp>
+ #include <boost/numeric/ublas/lu.hpp>
+ 
+ 
+ #include "viennacl/scalar.hpp"
+ #include "viennacl/vector.hpp"
+ #include "viennacl/coordinate_matrix.hpp"
+ #include "viennacl/compressed_matrix.hpp"
+ #include "viennacl/ell_matrix.hpp"
+ #include "viennacl/hyb_matrix.hpp"
+ #include "viennacl/linalg/prod.hpp"
+ #include "viennacl/linalg/norm_2.hpp"
+ #include "viennacl/io/matrix_market.hpp"
+ #include "viennacl/linalg/ilu.hpp"
+ 
+ 
+ #include <iostream>
+ #include <vector>
+ #include "benchmark-utils.hpp"
+ #include "io.hpp"
+ 
+ 
+ #define BENCHMARK_RUNS          10
+ 
+ 
+ template<typename ScalarType>
+ int run_benchmark()
+ {
+   Timer timer;
+   double exec_time;
+ 
+   //ScalarType std_result = 0;
+ 
+   ScalarType std_factor1 = ScalarType(3.1415);
+   ScalarType std_factor2 = ScalarType(42.0);
+   viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
+   viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
+ 
+   boost::numeric::ublas::vector<ScalarType> ublas_vec1;
+   boost::numeric::ublas::vector<ScalarType> ublas_vec2;
+ 
+   if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_vec1))
+   {
+     std::cout << "Error reading RHS file" << std::endl;
+     return 0;
+   }
+   std::cout << "done reading rhs" << std::endl;
+   ublas_vec2 = ublas_vec1;
+ 
+   viennacl::compressed_matrix<ScalarType, 1> vcl_compressed_matrix_1;
+   viennacl::compressed_matrix<ScalarType, 4> vcl_compressed_matrix_4;
+   viennacl::compressed_matrix<ScalarType, 8> vcl_compressed_matrix_8;
+ 
+   viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix_128;
+ 
+   viennacl::ell_matrix<ScalarType, 1> vcl_ell_matrix_1;
+   viennacl::hyb_matrix<ScalarType, 1> vcl_hyb_matrix_1;
+ 
+   boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
+   if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+   {
+     std::cout << "Error reading Matrix file" << std::endl;
+     return 0;
+   }
+   //unsigned int cg_mat_size = cg_mat.size();
+   std::cout << "done reading matrix" << std::endl;
+ 
+   viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
+   viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size());
+ 
+   //cpu to gpu:
+   viennacl::copy(ublas_matrix, vcl_compressed_matrix_1);
+   #ifndef VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
+   viennacl::copy(ublas_matrix, vcl_compressed_matrix_4);
+   viennacl::copy(ublas_matrix, vcl_compressed_matrix_8);
+   #endif
+   viennacl::copy(ublas_matrix, vcl_coordinate_matrix_128);
+   viennacl::copy(ublas_matrix, vcl_ell_matrix_1);
+   viennacl::copy(ublas_matrix, vcl_hyb_matrix_1);
+   viennacl::copy(ublas_vec1, vcl_vec1);
+   viennacl::copy(ublas_vec2, vcl_vec2);
+ 
+ 
+   ///////////// Matrix operations /////////////////
+ 
+   std::cout << "------- Matrix-Vector product on CPU ----------" << std::endl;
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+   {
+     //ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
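+     // axpy_prod writes the product directly into ublas_vec1 (the 'true' flag zeroes it first), avoiding the temporary that the commented-out prod() assignment would create.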
+     boost::numeric::ublas::axpy_prod(ublas_matrix, ublas_vec2, ublas_vec1, true);
+   }
+   exec_time = timer.get();
+   std::cout << "CPU time: " << exec_time << std::endl;
+   std::cout << "CPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+   std::cout << ublas_vec1[0] << std::endl;
+ 
+ 
+   std::cout << "------- Matrix-Vector product with compressed_matrix ----------" << std::endl;
+ 
+ 
+   vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2); //startup calculation
+   vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2); //startup calculation
+   vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2); //startup calculation
+   //std_result = 0.0;
+ 
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+   {
+     vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2);
+   }
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "GPU time align1: " << exec_time << std::endl;
+   std::cout << "GPU align1 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+   std::cout << vcl_vec1[0] << std::endl;
+ 
+   std::cout << "Testing triangular solves: compressed_matrix" << std::endl;
+ 
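+   // One untimed solve serves as a warm-up (e.g. OpenCL kernel compilation); vcl_vec1 is restored right afterwards.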
+   viennacl::copy(ublas_vec1, vcl_vec1);
+   viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
+   viennacl::copy(ublas_vec1, vcl_vec1);
+   std::cout << "ublas..." << std::endl;
+   timer.start();
+   boost::numeric::ublas::inplace_solve(trans(ublas_matrix), ublas_vec1, boost::numeric::ublas::unit_lower_tag());
+   std::cout << "Time elapsed: " << timer.get() << std::endl;
+   std::cout << "ViennaCL..." << std::endl;
+   viennacl::backend::finish();
+   timer.start();
+   viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
+   viennacl::backend::finish();
+   std::cout << "Time elapsed: " << timer.get() << std::endl;
+ 
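+   // Recompute ublas_vec1, which the in-place triangular solves above have overwritten.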
+   ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
+ 
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+   {
+     vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2);
+   }
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "GPU time align4: " << exec_time << std::endl;
+   std::cout << "GPU align4 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+   std::cout << vcl_vec1[0] << std::endl;
+ 
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+   {
+     vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2);
+   }
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "GPU time align8: " << exec_time << std::endl;
+   std::cout << "GPU align8 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+   std::cout << vcl_vec1[0] << std::endl;
+ 
+ 
+   std::cout << "------- Matrix-Vector product with coordinate_matrix ----------" << std::endl;
+   vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2); //startup calculation
+   viennacl::backend::finish();
+ 
+   viennacl::copy(vcl_vec1, ublas_vec2);
+   long err_cnt = 0;
+   for (std::size_t i=0; i<ublas_vec1.size(); ++i)
+   {
+     if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
+     {
+       std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
+       ++err_cnt;
+       if (err_cnt > 5)
+         break;
+     }
+   }
+ 
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+   {
+     vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2);
+   }
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "GPU time: " << exec_time << std::endl;
+   std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+   std::cout << vcl_vec1[0] << std::endl;
+ 
+ 
+   std::cout << "------- Matrix-Vector product with ell_matrix ----------" << std::endl;
+   vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2); //startup calculation
+   viennacl::backend::finish();
+ 
+   viennacl::copy(vcl_vec1, ublas_vec2);
+   err_cnt = 0;
+   for (std::size_t i=0; i<ublas_vec1.size(); ++i)
+   {
+     if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
+     {
+       std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
+       ++err_cnt;
+       if (err_cnt > 5)
+         break;
+     }
+   }
+ 
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+   {
+     vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2);
+   }
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "GPU time: " << exec_time << std::endl;
+   std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+   std::cout << vcl_vec1[0] << std::endl;
+ 
+ 
+   std::cout << "------- Matrix-Vector product with hyb_matrix ----------" << std::endl;
+   vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2); //startup calculation
+   viennacl::backend::finish();
+ 
+   viennacl::copy(vcl_vec1, ublas_vec2);
+   err_cnt = 0;
+   for (std::size_t i=0; i<ublas_vec1.size(); ++i)
+   {
+     if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
+     {
+       std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
+       ++err_cnt;
+       if (err_cnt > 5)
+         break;
+     }
+   }
+ 
+   viennacl::backend::finish();
+   timer.start();
+   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+   {
+     vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2);
+   }
+   viennacl::backend::finish();
+   exec_time = timer.get();
+   std::cout << "GPU time: " << exec_time << std::endl;
+   std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+   std::cout << vcl_vec1[0] << std::endl;
+ 
+ 
+   return EXIT_SUCCESS;
+ }
+ 
+ 
+ int main()
+ {
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "               Device Info" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+   std::cout << viennacl::ocl::current_device().info() << std::endl;
+ #endif
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Benchmark :: Sparse" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   std::cout << "   -------------------------------" << std::endl;
+   std::cout << "   # benchmarking single-precision" << std::endl;
+   std::cout << "   -------------------------------" << std::endl;
+   run_benchmark<float>();
+ #ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+ #endif
+   {
+     std::cout << std::endl;
+     std::cout << "   -------------------------------" << std::endl;
+     std::cout << "   # benchmarking double-precision" << std::endl;
+     std::cout << "   -------------------------------" << std::endl;
+     run_benchmark<double>();
+   }
+   return 0;
+ }
+ 
++>>>>>>> upstream/1.5.1
diff --cc examples/tutorial/CMakeLists.txt
index 6a6b956,eacf4eb..75fb4d0
--- a/examples/tutorial/CMakeLists.txt
+++ b/examples/tutorial/CMakeLists.txt
@@@ -5,13 -14,14 +14,21 @@@ endforeach(
  
  if(ENABLE_UBLAS)
     include_directories(${Boost_INCLUDE_DIRS})
++<<<<<<< HEAD
 +   foreach(tut amg blas2 blas3 iterative iterative-ublas lanczos matrix-range power-iter qr spai sparse structured-matrices vector-range)
++=======
+    foreach(tut blas2 blas3 iterative-ublas lanczos least-squares matrix-range power-iter qr sparse vector-range)
++>>>>>>> upstream/1.5.1
        add_executable(${tut} ${tut}.cpp)
-       target_link_libraries(${tut} ${OPENCL_LIBRARIES})
+       target_link_libraries(${tut} ${Boost_LIBRARIES})
+       if (ENABLE_OPENCL)
+         target_link_libraries(${tut} ${OPENCL_LIBRARIES} ${Boost_LIBRARIES})
+         set_target_properties(${tut} PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+       endif (ENABLE_OPENCL)
     endforeach()
 +   
 +   target_link_libraries(lanczos ${OPENCL_LIBRARIES} boost_system)
 +   target_link_libraries(power-iter ${OPENCL_LIBRARIES} boost_system)
  endif()
  
  if(ENABLE_EIGEN)
diff --cc examples/tutorial/iterative-ublas.cpp
index 73bcf18,ad6a87e..2e07dc7
--- a/examples/tutorial/iterative-ublas.cpp
+++ b/examples/tutorial/iterative-ublas.cpp
@@@ -1,166 -1,156 +1,325 @@@
++<<<<<<< HEAD
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +//
 +// include necessary system headers
 +//
 +#include <iostream>
 +
 +//
 +// Necessary to obtain suitable performance with ublas (NDEBUG disables its costly debug checks)
 +#ifndef NDEBUG
 + #define NDEBUG
 +#endif
 +
 +
 +//
 +// ublas includes
 +//
 +#include <boost/numeric/ublas/io.hpp>
 +#include <boost/numeric/ublas/triangular.hpp>
 +#include <boost/numeric/ublas/matrix_sparse.hpp>
 +#include <boost/numeric/ublas/matrix.hpp>
 +#include <boost/numeric/ublas/matrix_proxy.hpp>
 +#include <boost/numeric/ublas/operation.hpp>
 +#include <boost/numeric/ublas/operation_sparse.hpp>
 +#include <boost/numeric/ublas/lu.hpp>
 +
 +// Must be set if you want to use ViennaCL algorithms on ublas objects
 +#define VIENNACL_HAVE_UBLAS 1
 +
 +//
 +// ViennaCL includes
 +//
 +#include "viennacl/linalg/ilu.hpp"
 +#include "viennacl/linalg/cg.hpp"
 +#include "viennacl/linalg/bicgstab.hpp"
 +#include "viennacl/linalg/gmres.hpp"
 +#include "viennacl/io/matrix_market.hpp"
 +
 +// Some helper functions for this tutorial:
 +#include "Random.hpp"
 +#include "vector-io.hpp"
 +
 +/*
 +*
 +*   Tutorial:  Iterative solvers without OpenCL
 +*   
 +*/
 +using namespace boost::numeric;
 +
 +
 +int main()
 +{
 +  typedef float       ScalarType;
 +  
 +  //
 +  // Set up some ublas objects
 +  //
 +  ublas::vector<ScalarType> rhs;
 +  ublas::vector<ScalarType> rhs2;
 +  ublas::vector<ScalarType> ref_result;
 +  ublas::vector<ScalarType> result;
 +  ublas::compressed_matrix<ScalarType> ublas_matrix;
 +  
 +  //
 +  // Read system from file
 +  //
 +  #ifdef _MSC_VER
 +  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
 +  #else
 +  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
 +  #endif
 +  {
 +    std::cout << "Error reading Matrix file" << std::endl;
 +    return 0;
 +  }
 +  //std::cout << "done reading matrix" << std::endl;
 +
 +  #ifdef _MSC_VER
 +  if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", rhs))
 +  #else
 +  if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
 +  #endif
 +  {
 +    std::cout << "Error reading RHS file" << std::endl;
 +    return 0;
 +  }
 +  //std::cout << "done reading rhs" << std::endl;
 +
 +  #ifdef _MSC_VER
 +  if (!readVectorFromFile("../../examples/testdata/result65025.txt", ref_result))
 +  #else
 +  if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
 +  #endif
 +  {
 +    std::cout << "Error reading Result file" << std::endl;
 +    return 0;
 +  }
 +  //std::cout << "done reading result" << std::endl;
 +
 +  
 +  //
 +  // Set up ILUT, ILU0, and block-ILU0 preconditioners for the ublas matrix. Other preconditioners can also be used (see manual).
 +  // 
 +  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
 +  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
 +  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
 +                                       viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
 +  
 +  //
 +  // Conjugate gradient solver:
 +  //
 +  std::cout << "----- CG Test -----" << std::endl;
 +
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
 +  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
 +  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilu0);
 +  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_block_ilu0);
 +  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
 +  
 +  //
 +  // Stabilized BiConjugate gradient solver:
 +  //
 +  std::cout << "----- BiCGStab Test -----" << std::endl;
 +
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilu0); //with preconditioner
 +  
 +  //
 +  // GMRES solver:
 +  //
 +  std::cout << "----- GMRES Test -----" << std::endl;
 +
 +  //
 +  // for ublas objects:
 +  //
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilu0);//with preconditioner
 +
 +  //
 +  //  That's it. 
 +  //
 +  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
 +  
 +  return 0;
 +}
 +
++=======
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /*
+ *
+ *   Tutorial:  Use of the iterative solvers in ViennaCL with Boost.uBLAS
+ *
+ */
+ 
+ //
+ // include necessary system headers
+ //
+ #include <iostream>
+ 
+ //
+ // Necessary to obtain suitable performance with ublas (NDEBUG disables its costly debug checks)
+ #ifndef NDEBUG
+  #define NDEBUG
+ #endif
+ 
+ 
+ //
+ // ublas includes
+ //
+ #include <boost/numeric/ublas/io.hpp>
+ #include <boost/numeric/ublas/triangular.hpp>
+ #include <boost/numeric/ublas/matrix_sparse.hpp>
+ #include <boost/numeric/ublas/matrix.hpp>
+ #include <boost/numeric/ublas/matrix_proxy.hpp>
+ #include <boost/numeric/ublas/operation.hpp>
+ #include <boost/numeric/ublas/operation_sparse.hpp>
+ #include <boost/numeric/ublas/lu.hpp>
+ 
+ // Must be set if you want to use ViennaCL algorithms on ublas objects
+ #define VIENNACL_WITH_UBLAS 1
+ 
+ //
+ // ViennaCL includes
+ //
+ #include "viennacl/linalg/ilu.hpp"
+ #include "viennacl/linalg/cg.hpp"
+ #include "viennacl/linalg/bicgstab.hpp"
+ #include "viennacl/linalg/gmres.hpp"
+ #include "viennacl/io/matrix_market.hpp"
+ 
+ // Some helper functions for this tutorial:
+ #include "Random.hpp"
+ #include "vector-io.hpp"
+ 
+ using namespace boost::numeric;
+ 
+ 
+ int main()
+ {
+   typedef float       ScalarType;
+ 
+   //
+   // Set up some ublas objects
+   //
+   ublas::vector<ScalarType> rhs;
+   ublas::vector<ScalarType> rhs2;
+   ublas::vector<ScalarType> ref_result;
+   ublas::vector<ScalarType> result;
+   ublas::compressed_matrix<ScalarType> ublas_matrix;
+ 
+   //
+   // Read system from file
+   //
+   if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+   {
+     std::cout << "Error reading Matrix file" << std::endl;
+     return 0;
+   }
+   //std::cout << "done reading matrix" << std::endl;
+ 
+   if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
+   {
+     std::cout << "Error reading RHS file" << std::endl;
+     return 0;
+   }
+   //std::cout << "done reading rhs" << std::endl;
+ 
+   if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
+   {
+     std::cout << "Error reading Result file" << std::endl;
+     return 0;
+   }
+   //std::cout << "done reading result" << std::endl;
+ 
+ 
+   //
+   // Set up ILUT, ILU0, and block-ILU0 preconditioners for the ublas matrix. Other preconditioners can also be used (see manual).
+   //
+   viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
+   viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+   viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
+                                        viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+ 
+   //
+   // Conjugate gradient solver:
+   //
+   std::cout << "----- CG Test -----" << std::endl;
+ 
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
+   std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
+   std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilu0);
+   std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_block_ilu0);
+   std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
+ 
+   //
+   // Stabilized BiConjugate gradient solver:
+   //
+   std::cout << "----- BiCGStab Test -----" << std::endl;
+ 
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilu0); //with preconditioner
+ 
+   //
+   // GMRES solver:
+   //
+   std::cout << "----- GMRES Test -----" << std::endl;
+ 
+   //
+   // for ublas objects:
+   //
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilu0);//with preconditioner
+ 
+   //
+   //  That's it.
+   //
+   std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+ 
+   return 0;
+ }
+ 
++>>>>>>> upstream/1.5.1
diff --cc examples/tutorial/iterative.cpp
index 2fde7ff,1efde9d..1567130
--- a/examples/tutorial/iterative.cpp
+++ b/examples/tutorial/iterative.cpp
@@@ -1,244 -1,235 +1,482 @@@
++<<<<<<< HEAD
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +//
 +// include necessary system headers
 +//
 +#include <iostream>
 +
 +//
 +// Necessary to obtain suitable performance with ublas (NDEBUG disables its costly debug checks)
 +#ifndef NDEBUG
 + #define NDEBUG
 +#endif
 +
 +//
 +// ublas includes
 +//
 +#include <boost/numeric/ublas/io.hpp>
 +#include <boost/numeric/ublas/triangular.hpp>
 +#include <boost/numeric/ublas/matrix_sparse.hpp>
 +#include <boost/numeric/ublas/matrix.hpp>
 +#include <boost/numeric/ublas/matrix_proxy.hpp>
 +#include <boost/numeric/ublas/operation.hpp>
 +#include <boost/numeric/ublas/operation_sparse.hpp>
 +#include <boost/numeric/ublas/lu.hpp>
 +
 +// Must be set if you want to use ViennaCL algorithms on ublas objects
 +#define VIENNACL_HAVE_UBLAS 1
 +
 +
 +//
 +// ViennaCL includes
 +//
 +#include "viennacl/scalar.hpp"
 +#include "viennacl/vector.hpp"
 +#include "viennacl/compressed_matrix.hpp"
 +#include "viennacl/coordinate_matrix.hpp"
 +#include "viennacl/linalg/prod.hpp"
 +#include "viennacl/linalg/ilu.hpp"
 +#include "viennacl/linalg/jacobi_precond.hpp"
 +#include "viennacl/linalg/cg.hpp"
 +#include "viennacl/linalg/bicgstab.hpp"
 +#include "viennacl/linalg/gmres.hpp"
 +#include "viennacl/io/matrix_market.hpp"
 +
 +
 +// Some helper functions for this tutorial:
 +#include "Random.hpp"
 +#include "vector-io.hpp"
 +
 +
 +/*
 +*
 +*   Tutorial:  Iterative solvers
 +*   
 +*/
 +using namespace boost::numeric;
 +
 +
 +int main()
 +{
 +  typedef float       ScalarType;
 +  
 +  //
 +  // Set up some ublas objects
 +  //
 +  ublas::vector<ScalarType> rhs;
 +  ublas::vector<ScalarType> rhs2;
 +  ublas::vector<ScalarType> ref_result;
 +  ublas::vector<ScalarType> result;
 +  ublas::compressed_matrix<ScalarType> ublas_matrix;
 +  
 +  //
 +  // Read system from file
 +  //
 +  #ifdef _MSC_VER
 +  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
 +  #else
 +  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
 +  #endif
 +  {
 +    std::cout << "Error reading Matrix file" << std::endl;
 +    return 0;
 +  }
 +  //std::cout << "done reading matrix" << std::endl;
 +
 +  #ifdef _MSC_VER
 +  if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", rhs))
 +  #else
 +  if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
 +  #endif
 +  {
 +    std::cout << "Error reading RHS file" << std::endl;
 +    return 0;
 +  }
 +  //std::cout << "done reading rhs" << std::endl;
 +
 +  #ifdef _MSC_VER
 +  if (!readVectorFromFile("../../examples/testdata/result65025.txt", ref_result))
 +  #else
 +  if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
 +  #endif
 +  {
 +    std::cout << "Error reading Result file" << std::endl;
 +    return 0;
 +  }
 +  //std::cout << "done reading result" << std::endl;
 +
 +  //
 +  // Set up some ViennaCL objects
 +  //
 +  size_t vcl_size = rhs.size();
 +  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix;
 +  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix;
 +  viennacl::vector<ScalarType> vcl_rhs(vcl_size); 
 +  viennacl::vector<ScalarType> vcl_result(vcl_size);
 +  viennacl::vector<ScalarType> vcl_ref_result(vcl_size);
 +  
 +  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
 +  viennacl::copy(ref_result.begin(), ref_result.end(), vcl_ref_result.begin());
 +  
 +  
 +  //
 +  // Transfer ublas-matrix to GPU:
 +  //
 +  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
 +  
 +  //
 +  // alternative way: via STL. Sparse matrix as std::vector< std::map< unsigned int, ScalarType> >
 +  //
 +  std::vector< std::map< unsigned int, ScalarType> > stl_matrix(rhs.size());
 +  for (ublas::compressed_matrix<ScalarType>::iterator1 iter1 = ublas_matrix.begin1();
 +       iter1 != ublas_matrix.end1();
 +       ++iter1)
 +  {
 +    for (ublas::compressed_matrix<ScalarType>::iterator2 iter2 = iter1.begin();
 +         iter2 != iter1.end();
 +         ++iter2)
 +         stl_matrix[iter2.index1()][static_cast<unsigned int>(iter2.index2())] = *iter2;
 +  }
 +  viennacl::copy(stl_matrix, vcl_coordinate_matrix);
 +  viennacl::copy(vcl_coordinate_matrix, stl_matrix);
 +  
 +  //
 +  // set up ILUT preconditioners for ublas and ViennaCL objects:
 +  // 
 +  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
 +  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
 +  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
 +                                       viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
 +  
 +  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
 +  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
 +  viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<ScalarType>,
 +                                       viennacl::linalg::ilu0_tag>          vcl_block_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
 +
 +  //
 +  // set up Jacobi preconditioners for ViennaCL and ublas objects:
 +  // 
 +  viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
 +  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
 +  
 +  //
 +  // Conjugate gradient solver:
 +  //
 +  std::cout << "----- CG Test -----" << std::endl;
 +  
 +  //
 +  // for ublas objects:
 +  //
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_jacobi);
 +
 +  
 +  //
 +  // for ViennaCL objects:
 +  //
 +  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag());
 +  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_ilut);
 +  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_jacobi);
 +  
 +  //
 +  // Stabilized BiConjugate gradient solver:
 +  //
 +  std::cout << "----- BiCGStab Test -----" << std::endl;
 +
 +  //
 +  // for ublas objects:
 +  //
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_jacobi); //with preconditioner
 +
 +  
 +  //
 +  // for ViennaCL objects:
 +  //
 +  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag());   //without preconditioner
 +  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_ilut); //with preconditioner
 +  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_jacobi); //with preconditioner
 +  
 +  //
 +  // GMRES solver:
 +  //
 +  std::cout << "----- GMRES Test -----" << std::endl;
 +  std::cout << " ATTENTION: Please be aware that GMRES may not work on ATI GPUs when using Stream SDK v2.1." << std::endl;
 +
 +  //
 +  // for ublas objects:
 +  //
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
 +  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_jacobi);//with preconditioner
 +
 +  //
 +  // for ViennaCL objects:
 +  //
 +  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag());   //without preconditioner
 +  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_ilut);//with preconditioner
 +  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_jacobi);//with preconditioner
 +
 +  //
 +  //  That's it.
 +  //
 +  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
 +  
 +  return 0;
 +}
 +
++=======
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /*
+ *
+ *   Tutorial:  Iterative solvers in ViennaCL (iterative.cpp and iterative.cu are identical, the latter being required for compilation using CUDA nvcc)
+ *
+ */
+ 
+ //
+ // include necessary system headers
+ //
+ #include <iostream>
+ 
+ //
 + // Necessary to obtain suitable performance with ublas
+ #ifndef NDEBUG
+  #define NDEBUG
+ #endif
+ 
+ //
+ // ublas includes
+ //
+ #include <boost/numeric/ublas/io.hpp>
+ #include <boost/numeric/ublas/triangular.hpp>
+ #include <boost/numeric/ublas/matrix_sparse.hpp>
+ #include <boost/numeric/ublas/matrix.hpp>
+ #include <boost/numeric/ublas/matrix_proxy.hpp>
+ #include <boost/numeric/ublas/operation.hpp>
+ #include <boost/numeric/ublas/operation_sparse.hpp>
+ #include <boost/numeric/ublas/io.hpp>
+ #include <boost/numeric/ublas/lu.hpp>
+ 
+ // Must be set if you want to use ViennaCL algorithms on ublas objects
+ #define VIENNACL_WITH_UBLAS 1
+ 
+ 
+ //
+ // ViennaCL includes
+ //
+ #include "viennacl/scalar.hpp"
+ #include "viennacl/vector.hpp"
+ #include "viennacl/compressed_matrix.hpp"
+ #include "viennacl/coordinate_matrix.hpp"
+ #include "viennacl/linalg/prod.hpp"
+ #include "viennacl/linalg/ilu.hpp"
+ #include "viennacl/linalg/jacobi_precond.hpp"
+ #include "viennacl/linalg/cg.hpp"
+ #include "viennacl/linalg/bicgstab.hpp"
+ #include "viennacl/linalg/gmres.hpp"
+ #include "viennacl/io/matrix_market.hpp"
+ 
+ 
+ // Some helper functions for this tutorial:
+ #include "Random.hpp"
+ #include "vector-io.hpp"
+ 
+ 
+ using namespace boost::numeric;
+ 
+ 
+ int main()
+ {
+   typedef float       ScalarType;
+ 
+   //
+   // Set up some ublas objects
+   //
+   ublas::vector<ScalarType> rhs;
+   ublas::vector<ScalarType> rhs2;
+   ublas::vector<ScalarType> ref_result;
+   ublas::vector<ScalarType> result;
+   ublas::compressed_matrix<ScalarType> ublas_matrix;
+ 
+   //
+   // Read system from file
+   //
+   if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+   {
+     std::cout << "Error reading Matrix file" << std::endl;
+     return 0;
+   }
+   //std::cout << "done reading matrix" << std::endl;
+ 
+   if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
+   {
+     std::cout << "Error reading RHS file" << std::endl;
+     return 0;
+   }
+   //std::cout << "done reading rhs" << std::endl;
+ 
+   if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
+   {
+     std::cout << "Error reading Result file" << std::endl;
+     return 0;
+   }
+   //std::cout << "done reading result" << std::endl;
+ 
+   //
+   // Set up some ViennaCL objects
+   //
+   std::size_t vcl_size = rhs.size();
+   viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix;
+   viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix;
+   viennacl::vector<ScalarType> vcl_rhs(vcl_size);
+   viennacl::vector<ScalarType> vcl_result(vcl_size);
+   viennacl::vector<ScalarType> vcl_ref_result(vcl_size);
+ 
+   viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+   viennacl::copy(ref_result.begin(), ref_result.end(), vcl_ref_result.begin());
+ 
+ 
+   //
+   // Transfer ublas-matrix to GPU:
+   //
+   viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+ 
+   //
+   // alternative way: via STL. Sparse matrix as std::vector< std::map< unsigned int, ScalarType> >
+   //
+   std::vector< std::map< unsigned int, ScalarType> > stl_matrix(rhs.size());
+   for (ublas::compressed_matrix<ScalarType>::iterator1 iter1 = ublas_matrix.begin1();
+        iter1 != ublas_matrix.end1();
+        ++iter1)
+   {
+     for (ublas::compressed_matrix<ScalarType>::iterator2 iter2 = iter1.begin();
+          iter2 != iter1.end();
+          ++iter2)
+          stl_matrix[iter2.index1()][static_cast<unsigned int>(iter2.index2())] = *iter2;
+   }
+   viennacl::copy(stl_matrix, vcl_coordinate_matrix);
+   viennacl::copy(vcl_coordinate_matrix, stl_matrix);
+ 
+   //
+   // set up ILUT preconditioners for ublas and ViennaCL objects:
+   //
+   std::cout << "Setting up preconditioners for uBLAS-matrix..." << std::endl;
+   viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
+   viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+   viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
+                                        viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+ 
+   std::cout << "Setting up preconditioners for ViennaCL-matrix..." << std::endl;
+   viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
+   viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+   viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<ScalarType>,
+                                        viennacl::linalg::ilu0_tag>          vcl_block_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+ 
+   //
+   // set up Jacobi preconditioners for ViennaCL and ublas objects:
+   //
+   viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
+   viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
+ 
+   //
+   // Conjugate gradient solver:
+   //
+   std::cout << "----- CG Test -----" << std::endl;
+ 
+   //
+   // for ublas objects:
+   //
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_jacobi);
+ 
+ 
+   //
+   // for ViennaCL objects:
+   //
+   vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag());
+   vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_ilut);
+   vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_jacobi);
+ 
+   //
+   // Stabilized BiConjugate gradient solver:
+   //
+   std::cout << "----- BiCGStab Test -----" << std::endl;
+ 
+   //
+   // for ublas objects:
+   //
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_jacobi); //with preconditioner
+ 
+ 
+   //
+   // for ViennaCL objects:
+   //
+   vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag());   //without preconditioner
+   vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_ilut); //with preconditioner
+   vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_jacobi); //with preconditioner
+ 
+   //
+   // GMRES solver:
+   //
+   std::cout << "----- GMRES Test -----" << std::endl;
+ 
+   //
+   // for ublas objects:
+   //
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
+   result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_jacobi);//with preconditioner
+ 
+   //
+   // for ViennaCL objects:
+   //
+   vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag());   //without preconditioner
+   vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_ilut);//with preconditioner
+   vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_jacobi);//with preconditioner
+ 
+   //
+   //  That's it.
+   //
+   std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+ 
+   return 0;
+ }
+ 
++>>>>>>> upstream/1.5.1
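
The ILUT preconditioners in both versions above are constructed with default settings; ilut_tag also accepts the fill-in and drop-tolerance parameters explicitly (argument meanings as documented for ViennaCL 1.5 -- the concrete values below are illustrative assumptions):

    #define VIENNACL_WITH_UBLAS 1
    #include <boost/numeric/ublas/matrix_sparse.hpp>
    #include <boost/numeric/ublas/vector.hpp>
    #include "viennacl/linalg/prod.hpp"
    #include "viennacl/linalg/ilu.hpp"
    #include "viennacl/linalg/bicgstab.hpp"

    typedef boost::numeric::ublas::compressed_matrix<float> MatrixT;
    typedef boost::numeric::ublas::vector<float>            VectorT;

    VectorT solve_with_ilut(MatrixT const & A, VectorT const & b)
    {
      // ilut_tag(entries kept per row, drop tolerance):
      viennacl::linalg::ilut_precond<MatrixT> ilut(A, viennacl::linalg::ilut_tag(10, 1e-5));
      // same tag interface as in the tutorial, preconditioner as last argument:
      return viennacl::linalg::solve(A, b, viennacl::linalg::bicgstab_tag(1e-6, 100), ilut);
    }
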
diff --cc examples/tutorial/lanczos.cpp
index 53d605e,9ac94e9..aabec25
--- a/examples/tutorial/lanczos.cpp
+++ b/examples/tutorial/lanczos.cpp
@@@ -1,19 -1,26 +1,39 @@@
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
  ============================================================================= */
  
++<<<<<<< HEAD
++=======
+ /*
+ *
+ *   Tutorial: Calculation of eigenvalues using Lanczos' method (lanczos.cpp and lanczos.cu are identical, the latter being required for compilation using CUDA nvcc)
+ *
+ */
+ 
++>>>>>>> upstream/1.5.1
  // include necessary system headers
  #include <iostream>
  
@@@ -21,7 -28,7 +41,11 @@@
    #define NDEBUG
  #endif
  
++<<<<<<< HEAD
 +#define VIENNACL_HAVE_UBLAS
++=======
+ #define VIENNACL_WITH_UBLAS
++>>>>>>> upstream/1.5.1
  
  //include basic scalar and vector types of ViennaCL
  #include "viennacl/scalar.hpp"
@@@ -42,14 -49,10 +66,21 @@@
  #include <boost/numeric/ublas/matrix_expression.hpp>
  #include <boost/numeric/ublas/matrix_sparse.hpp>
  #include <boost/numeric/ublas/vector.hpp>
++<<<<<<< HEAD
 +#include <boost/numeric/ublas/operation.hpp> 
 +#include <boost/numeric/ublas/vector_expression.hpp>
 +#include <boost/filesystem.hpp>
 +
 +
 +/*
 +*   Tutorial: calculation of eigenvalues - Lanczos and power iteration
 +*/
++=======
+ #include <boost/numeric/ublas/operation.hpp>
+ #include <boost/numeric/ublas/vector_expression.hpp>
+ 
+ 
++>>>>>>> upstream/1.5.1
  
  template <typename MatrixType>
  std::vector<double> initEig(MatrixType const & A)
@@@ -57,9 -60,9 +88,15 @@@
    viennacl::linalg::lanczos_tag ltag(0.75, 10, viennacl::linalg::lanczos_tag::partial_reorthogonalization, 1700);
    std::vector<double> lanczos_eigenvalues = viennacl::linalg::eig(A, ltag);
    for(std::size_t i = 0; i< lanczos_eigenvalues.size(); i++){
++<<<<<<< HEAD
 +          std::cout << "Eigenvalue " << i+1 << ": " << std::setprecision(10) << lanczos_eigenvalues[i] << std::endl; 
 +  }
 +  
++=======
+           std::cout << "Eigenvalue " << i+1 << ": " << std::setprecision(10) << lanczos_eigenvalues[i] << std::endl;
+   }
+ 
++>>>>>>> upstream/1.5.1
    return lanczos_eigenvalues;
  }
  
@@@ -67,18 -70,16 +104,29 @@@
  int main()
  {
    typedef double     ScalarType;
++<<<<<<< HEAD
 +  
 +  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_A;
 +
 +  viennacl::compressed_matrix<double>  vcl_A(ublas_A.size1(), ublas_A.size2());  
 +  viennacl::copy(ublas_A, vcl_A);
 +  
++=======
+ 
+   boost::numeric::ublas::compressed_matrix<ScalarType> ublas_A;
+ 
++>>>>>>> upstream/1.5.1
    if (!viennacl::io::read_matrix_market_file(ublas_A, "../examples/testdata/mat65k.mtx"))
    {
      std::cout << "Error reading Matrix file" << std::endl;
      return 0;
    }
++<<<<<<< HEAD
 +  
++=======
+ 
+   std::cout << "Running Lanczos algorithm (this might take a while)..." << std::endl;
++>>>>>>> upstream/1.5.1
    std::vector<double> eigenvalues = initEig(ublas_A);
  }
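
For reference, the lanczos_tag constructed in initEig() above carries, in order, a factor controlling the reorthogonalization tolerance, the number of eigenvalues to compute, the reorthogonalization strategy, and the size of the Krylov space (parameter order as used by this tutorial; the header name below is an assumption based on the 1.5 layout). Condensed to a single function:

    #define VIENNACL_WITH_UBLAS 1
    #include <vector>
    #include <boost/numeric/ublas/matrix_sparse.hpp>
    #include "viennacl/linalg/lanczos.hpp"

    std::vector<double> largest_eigenvalues(
        boost::numeric::ublas::compressed_matrix<double> const & A)
    {
      // (reorthogonalization factor, #eigenvalues, strategy, Krylov space size):
      viennacl::linalg::lanczos_tag ltag(0.75, 10,
          viennacl::linalg::lanczos_tag::partial_reorthogonalization, 1700);
      return viennacl::linalg::eig(A, ltag);
    }
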
  
diff --cc examples/tutorial/power-iter.cpp
index e6d4927,3028ca7..20d09ff
--- a/examples/tutorial/power-iter.cpp
+++ b/examples/tutorial/power-iter.cpp
@@@ -1,19 -1,27 +1,40 @@@
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
  ============================================================================= */
  
++<<<<<<< HEAD
++=======
+ /*
+ *
+ *   Tutorial: Calculation of the eigenvalue with largest modulus using the power iteration method
+ *             (power-iter.cpp and power-iter.cu are identical, the latter being required for compilation using CUDA nvcc)
+ *
+ */
+ 
++>>>>>>> upstream/1.5.1
  // include necessary system headers
  #include <iostream>
  
@@@ -21,7 -29,7 +42,11 @@@
    #define NDEBUG
  #endif
  
++<<<<<<< HEAD
 +#define VIENNACL_HAVE_UBLAS
++=======
+ #define VIENNACL_WITH_UBLAS
++>>>>>>> upstream/1.5.1
  
  //include basic scalar and vector types of ViennaCL
  #include "viennacl/scalar.hpp"
@@@ -41,21 -49,15 +66,30 @@@
  #include <boost/numeric/ublas/matrix_expression.hpp>
  #include <boost/numeric/ublas/matrix_sparse.hpp>
  #include <boost/numeric/ublas/vector.hpp>
++<<<<<<< HEAD
 +#include <boost/numeric/ublas/operation.hpp> 
 +#include <boost/numeric/ublas/vector_expression.hpp>
 +#include <boost/filesystem.hpp>
 +
 +
 +/*
 +*   Tutorial: Power Iteration for finding the eigenvalue with largest modulus
 +*/
++=======
+ #include <boost/numeric/ublas/operation.hpp>
+ #include <boost/numeric/ublas/vector_expression.hpp>
++>>>>>>> upstream/1.5.1
  
  
  
  int main()
  {
    typedef double     ScalarType;
++<<<<<<< HEAD
 +  
++=======
+ 
++>>>>>>> upstream/1.5.1
    boost::numeric::ublas::compressed_matrix<ScalarType> ublas_A;
  
    if (!viennacl::io::read_matrix_market_file(ublas_A, "../examples/testdata/mat65k.mtx"))
@@@ -63,15 -65,15 +97,27 @@@
      std::cout << "Error reading Matrix file" << std::endl;
      return 0;
    }
++<<<<<<< HEAD
 +  
 +  viennacl::compressed_matrix<double>  vcl_A(ublas_A.size1(), ublas_A.size2());  
 +  viennacl::copy(ublas_A, vcl_A);
 +  
 +  viennacl::linalg::power_iter_tag ptag(1e-8);
++=======
+ 
+   viennacl::compressed_matrix<double>  vcl_A(ublas_A.size1(), ublas_A.size2());
+   viennacl::copy(ublas_A, vcl_A);
+ 
+   viennacl::linalg::power_iter_tag ptag(1e-6);
++>>>>>>> upstream/1.5.1
  
    std::cout << "Starting computation of eigenvalue with largest modulus (might take about a minute)..." << std::endl;
    std::cout << "Result of power iteration with ublas matrix (single-threaded): " << viennacl::linalg::eig(ublas_A, ptag) << std::endl;
    std::cout << "Result of power iteration with ViennaCL (OpenCL accelerated): " << viennacl::linalg::eig(vcl_A, ptag) << std::endl;
++<<<<<<< HEAD
 +  
++=======
+ 
++>>>>>>> upstream/1.5.1
  }
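
Note that the two sides resolve to different tolerances here (1e-8 on the packaging branch, 1e-6 upstream); power_iter_tag takes only that single termination tolerance, and eig() then returns the eigenvalue of largest modulus as a scalar. A minimal sketch (header name assumed from the 1.5 layout):

    #define VIENNACL_WITH_UBLAS 1
    #include <boost/numeric/ublas/matrix_sparse.hpp>
    #include "viennacl/linalg/power_iter.hpp"

    double largest_modulus_eigenvalue(
        boost::numeric::ublas::compressed_matrix<double> const & A)
    {
      viennacl::linalg::power_iter_tag ptag(1e-6);  // termination tolerance
      return viennacl::linalg::eig(A, ptag);
    }
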
  
diff --cc examples/tutorial/qr.cpp
index 2f3b629,7b7bc77..c143737
--- a/examples/tutorial/qr.cpp
+++ b/examples/tutorial/qr.cpp
@@@ -95,7 -98,7 +98,11 @@@ int main (int, const char **
  
    std::size_t rows = 113;   //number of rows in the matrix
    std::size_t cols = 54;   //number of columns
++<<<<<<< HEAD
 +  
++=======
+ 
++>>>>>>> upstream/1.5.1
    //
    // Create matrices with some data
    //
@@@ -148,7 -151,7 +155,11 @@@
    MatrixType ublas_QR = prod(Q, R);
    double ublas_error = check(ublas_QR, ublas_A_backup);
    std::cout << "Max rel error (ublas): " << ublas_error << std::endl;
++<<<<<<< HEAD
 +  
++=======
+ 
++>>>>>>> upstream/1.5.1
    //
    // QR factorization in ViennaCL using Boost.uBLAS for the panel factorization
    //
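
The check above multiplies the recovered factors back together and reports the maximum relative deviation from the saved input. A sketch of the surrounding host-side factorization, assuming the inplace_qr()/recoverQ() helpers from viennacl/linalg/qr.hpp behave as in the 1.5 tutorial sources:

    #define VIENNACL_WITH_UBLAS 1
    #include <vector>
    #include <boost/numeric/ublas/matrix.hpp>
    #include "viennacl/linalg/qr.hpp"

    void qr_sketch(boost::numeric::ublas::matrix<double> & A)
    {
      namespace ublas = boost::numeric::ublas;
      // factor A in place; the Householder coefficients come back in 'betas':
      std::vector<double> betas = viennacl::linalg::inplace_qr(A);
      ublas::matrix<double> Q(A.size1(), A.size1());
      ublas::matrix<double> R(A.size1(), A.size2());
      viennacl::linalg::recoverQ(A, betas, Q, R);  // assemble explicit Q and R
    }
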
diff --cc tests/CMakeLists.txt
index fde3e3d,dfc29ab..b8114fd
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@@ -1,16 -1,115 +1,126 @@@
++<<<<<<< HEAD
 +foreach(PROG blas3 blas3range fft iterators
 +        generator_inner_product generator_matrix generator_matrix_vector_product generator_vector
 +        matrix matrix_range matrix_slice nmf
 +        scalar sparse structured-matrices svd
 +        vector vector_range vector_slice)
 +   add_executable(${PROG}-test src/${PROG}.cpp)
 +   target_link_libraries(${PROG}-test ${OPENCL_LIBRARIES})
 +   add_test(${PROG} ${PROG}-test)
++=======
+ 
+ include_directories(${Boost_INCLUDE_DIRS})
+ 
+ # tests with CPU backend
+ foreach(PROG blas3_prod_float blas3_prod_double blas3_solve_float blas3_solve_double iterators
+              global_variables
+              matrix_vector matrix_vector_int
+              matrix_row_float matrix_row_double matrix_row_int
+              matrix_col_float matrix_col_double matrix_col_int
+              scalar scheduler_matrix scheduler_matrix_matrix scheduler_matrix_vector scheduler_sparse scheduler_vector sparse
+              vector_float vector_double vector_int vector_uint vector_multi_inner_prod
+              spmdm)
+    add_executable(${PROG}-test-cpu src/${PROG}.cpp)
+    target_link_libraries(${PROG}-test-cpu ${Boost_LIBRARIES})
+    add_test(${PROG}-cpu ${PROG}-test-cpu)
++>>>>>>> upstream/1.5.1
  endforeach(PROG)
  
- include_directories(${PROJECT_SOURCE_DIR}/external)
- add_executable(external_linkage 
-                 src/external_1.cpp
-                 src/external_2.cpp 
-                 ${PROJECT_SOURCE_DIR}/external/pugixml/src/pugixml.cpp)
- target_link_libraries(external_linkage ${OPENCL_LIBRARIES})
+ 
+ # tests with OpenCL backend
+ if (ENABLE_OPENCL)
+   foreach(PROG blas3_prod_float blas3_prod_double blas3_solve_float blas3_solve_double fft iterators
+                generator_blas1 generator_blas2 generator_blas3 #generator_segmentation
+                global_variables
+                matrix_vector matrix_vector_int
+                matrix_row_float matrix_row_double matrix_row_int
+                matrix_col_float matrix_col_double matrix_col_int
+                nmf qr_method
+                scalar sparse structured-matrices svd
+                vector_float vector_double vector_int vector_uint vector_multi_inner_prod
+                spmdm)
+      add_executable(${PROG}-test-opencl src/${PROG}.cpp)
+      target_link_libraries(${PROG}-test-opencl ${OPENCL_LIBRARIES} ${Boost_LIBRARIES})
+      add_test(${PROG}-opencl ${PROG}-test-opencl)
+      set_target_properties(${PROG}-test-opencl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+   endforeach(PROG)
+ 
+   include_directories(${PROJECT_SOURCE_DIR}/external)
+   add_executable(external_linkage-opencl
+                  src/external_1.cpp
+                  src/external_2.cpp)
+   target_link_libraries(external_linkage-opencl ${OPENCL_LIBRARIES} ${Boost_LIBRARIES})
+   set_target_properties(external_linkage-opencl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+ endif (ENABLE_OPENCL)
+ 
+ # tests with CUDA backend
+ if (ENABLE_CUDA)
+   foreach(PROG blas3_prod_float blas3_prod_double blas3_solve_float blas3_solve_double iterators
+                global_variables
+                matrix_vector matrix_vector_int
+                matrix_row_float matrix_row_double matrix_row_int
+                matrix_col_float matrix_col_double matrix_col_int
+                scalar sparse
+                vector_float vector_double vector_int vector_uint vector_multi_inner_prod
+                spmdm)
+      cuda_add_executable(${PROG}-test-cuda src/${PROG}.cu)
+      target_link_libraries(${PROG}-test-cuda ${Boost_LIBRARIES})
+      add_test(${PROG}-cuda ${PROG}-test-cuda)
+   endforeach(PROG)
+ 
+   include_directories(${PROJECT_SOURCE_DIR}/external)
+   cuda_add_executable(external_linkage-cuda
+                       src/external_1.cu
+                       src/external_2.cu)
+   target_link_libraries(external_linkage-cuda ${Boost_LIBRARIES})
+ endif (ENABLE_CUDA)
+ 
+ # test shared library
+ include_directories(${PROJECT_SOURCE_DIR}/libviennacl/include/)
+ 
+ if(ENABLE_CUDA)
+   if(ENABLE_OPENCL)
+     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-DVIENNACL_WITH_OPENCL") #set flags before setting executable!
+     cuda_add_executable(libviennacl_blas1-test src/libviennacl_blas1.cu)
+     target_link_libraries(libviennacl_blas1-test viennacl ${OPENCL_LIBRARIES})
+ 
+     cuda_add_executable(libviennacl_blas2-test src/libviennacl_blas2.cu)
+     target_link_libraries(libviennacl_blas2-test viennacl ${OPENCL_LIBRARIES})
+ 
+     cuda_add_executable(libviennacl_blas3-test src/libviennacl_blas3.cu)
+     target_link_libraries(libviennacl_blas3-test viennacl ${OPENCL_LIBRARIES})
+ 
+   else(ENABLE_OPENCL)
+     cuda_add_executable(libviennacl_blas1-test src/libviennacl_blas1.cu)
+     target_link_libraries(libviennacl_blas1-test viennacl)
+ 
+     cuda_add_executable(libviennacl_blas2-test src/libviennacl_blas2.cu)
+     target_link_libraries(libviennacl_blas2-test viennacl)
+ 
+     cuda_add_executable(libviennacl_blas3-test src/libviennacl_blas3.cu)
+     target_link_libraries(libviennacl_blas3-test viennacl)
+   endif(ENABLE_OPENCL)
+ else(ENABLE_CUDA)
+   add_executable(libviennacl_blas1-test src/libviennacl_blas1.cpp)
+   add_executable(libviennacl_blas2-test src/libviennacl_blas2.cpp)
+   add_executable(libviennacl_blas3-test src/libviennacl_blas3.cpp)
+   if(ENABLE_OPENCL)
+     set_target_properties(libviennacl_blas1-test PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+     target_link_libraries(libviennacl_blas1-test viennacl ${OPENCL_LIBRARIES})
+ 
+     set_target_properties(libviennacl_blas2-test PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+     target_link_libraries(libviennacl_blas2-test viennacl ${OPENCL_LIBRARIES})
+ 
+     set_target_properties(libviennacl_blas3-test PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+     target_link_libraries(libviennacl_blas3-test viennacl ${OPENCL_LIBRARIES})
+   else(ENABLE_OPENCL)
+     target_link_libraries(libviennacl_blas1-test viennacl)
+     target_link_libraries(libviennacl_blas2-test viennacl)
+     target_link_libraries(libviennacl_blas3-test viennacl)
+   endif(ENABLE_OPENCL)
+ endif(ENABLE_CUDA)
+ add_test(libviennacl-blas1 libviennacl_blas1-test)
+ add_test(libviennacl-blas2 libviennacl_blas2-test)
+ add_test(libviennacl-blas3 libviennacl_blas3-test)
+ 
+ 
diff --cc tests/src/blas3_solve_double.cpp
index 569dea9,e063f79..be81971
--- a/tests/src/blas3_solve_double.cpp
+++ b/tests/src/blas3_solve_double.cpp
@@@ -78,136 -83,36 +83,151 @@@ ScalarType diff(ublas::vector<ScalarTyp
     return norm_inf(v2_cpu);
  }
  
- template <typename ScalarType, typename F, unsigned int ALIGNMENT>
- ScalarType diff(ublas::matrix<ScalarType> & mat1, viennacl::matrix<ScalarType, F, ALIGNMENT> & mat2)
+ 
+ template <typename ScalarType, typename VCLMatrixType>
+ ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
  {
     ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
++<<<<<<< HEAD:tests/src/blas3.cpp
 +   viennacl::copy(mat2, mat2_cpu);
 +   double ret = 0;
 +   double act = 0;
++=======
+    viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+    viennacl::copy(mat2, mat2_cpu);
+    ScalarType ret = 0;
+    ScalarType act = 0;
++>>>>>>> upstream/1.5.1:tests/src/blas3_solve_double.cpp
  
-     for (std::size_t i = 0; i < mat2_cpu.size1(); ++i)
+     for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
      {
-       for (std::size_t j = 0; j < mat2_cpu.size2(); ++j)
+       for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
        {
-          act = fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( fabs(mat2_cpu(i, j)), fabs(mat1(i,j)) );
+          act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
           if (act > ret)
             ret = act;
 +         if (act > 0.1)
 +         {
 +           std::cout << "Offending index: " << i << ", " << j << std::endl;
 +           exit(0);
 +         }
        }
      }
     //std::cout << ret << std::endl;
-    return ScalarType(ret);
+    return ret;
  }
  
+ 
+ 
  //
- // -------------------------------------------------------------
+ // Triangular solvers
  //
++<<<<<<< HEAD:tests/src/blas3.cpp
 +template< typename NumericT, typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC, typename Epsilon >
 +int test_prod(Epsilon const& epsilon)
 +{
 +   int retval = EXIT_SUCCESS;
 +   //long matrix_size1 = 157;  //some odd number, not too large
 +   //long matrix_size2 = 91;  //some odd number, not too large
 +   //long matrix_size3 = 73;  //some odd number, not too large
 +   long matrix_size1 = 128;  //power-of-two size, not too large
 +   long matrix_size2 = 64;  //power-of-two size, not too large
 +   long matrix_size3 = 128;  //power-of-two size, not too large
 +   NumericT act_diff = 0;
 +   
 +   // --------------------------------------------------------------------------            
 +   ublas::matrix<NumericT> A(matrix_size1, matrix_size2);
 +   ublas::matrix<NumericT> B(matrix_size2, matrix_size3);
 +   ublas::matrix<NumericT> C(matrix_size1, matrix_size3);
 +
 +   //fill A and B:
 +   for (unsigned int i = 0; i < A.size1(); ++i)
 +      for (unsigned int j = 0; j < A.size2(); ++j)
 +         A(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
 +   for (unsigned int i = 0; i < B.size1(); ++i)
 +      for (unsigned int j = 0; j < B.size2(); ++j)
 +         B(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
 +
 +   ublas::matrix<NumericT> A_trans = trans(A);
 +   ublas::matrix<NumericT> B_trans = trans(B);
 +   
 +   MatrixTypeA vcl_A(matrix_size1, matrix_size2);
 +   MatrixTypeB vcl_B(matrix_size2, matrix_size3);
 +   MatrixTypeA vcl_A_trans(matrix_size2, matrix_size1);
 +   MatrixTypeB vcl_B_trans(matrix_size3, matrix_size2);
 +   MatrixTypeC vcl_C(matrix_size1, matrix_size3);
 +
 +   
 +   viennacl::copy(A, vcl_A);
 +   viennacl::copy(B, vcl_B);
 +   viennacl::copy(A_trans, vcl_A_trans);
 +   viennacl::copy(B_trans, vcl_B_trans);
 +
 +   // Test: C = A * B --------------------------------------------------------------------------       
 +   C     = viennacl::linalg::prod(A, B);
 +   vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
 +   act_diff = fabs(diff(C, vcl_C));
 +   
 +   if( act_diff > epsilon )
 +   {
 +     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
 +     std::cout << "  diff: " << act_diff << std::endl;
 +     retval = EXIT_FAILURE;
 +   }
 +   else
 +     std::cout << "Test C = A * B passed!" << std::endl;
 +   
 +   // Test: C = A * trans(B) --------------------------------------------------------------------------       
 +   C     = boost::numeric::ublas::prod(A, trans(B_trans));
 +   vcl_C = viennacl::linalg::prod(vcl_A, trans(vcl_B_trans));
 +   act_diff = fabs(diff(C, vcl_C));
 +   
 +   if( act_diff > epsilon )
 +   {
 +     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
 +     std::cout << "  diff: " << act_diff << std::endl;
 +     retval = EXIT_FAILURE;
 +   }
 +   else
 +     std::cout << "Test C = A * trans(B) passed!" << std::endl;
 +   
 +   // Test: C = trans(A) * B --------------------------------------------------------------------------       
 +   C     = boost::numeric::ublas::prod(trans(A_trans), B);
 +   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), vcl_B);
 +   act_diff = fabs(diff(C, vcl_C));
 +   
 +   if( act_diff > epsilon )
 +   {
 +     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
 +     std::cout << "  diff: " << act_diff << std::endl;
 +     retval = EXIT_FAILURE;
 +   }
 +   else
 +     std::cout << "Test C = trans(A) * B passed!" << std::endl;
 +   
 +   
 +   // Test: C = trans(A) * trans(B) --------------------------------------------------------------------------       
 +   C     = boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
 +   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans));
 +   act_diff = fabs(diff(C, vcl_C));
 +   
 +   if( act_diff > epsilon )
 +   {
 +     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
 +     std::cout << "  diff: " << act_diff << std::endl;
 +     retval = EXIT_FAILURE;
 +   }
 +   else
 +     std::cout << "Test C = trans(A) * trans(B) passed!" << std::endl;
 +   
 +   
 +   
 +   return retval;
 +}
++=======
+ 
+ 
++>>>>>>> upstream/1.5.1:tests/src/blas3_solve_double.cpp
  
  template <typename RHSTypeRef, typename RHSTypeCheck, typename Epsilon >
  void run_solver_check(RHSTypeRef & B_ref, RHSTypeCheck & B_check, int & retval, Epsilon const & epsilon)
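
The rewritten diff() above reports the maximum elementwise relative deviation between the uBLAS reference and the ViennaCL result. The same metric, reduced to two host-side matrices (a sketch of the test's error measure only; like the original it assumes the compared entries are not both zero):

    #include <cmath>
    #include <algorithm>
    #include <boost/numeric/ublas/matrix.hpp>

    template <typename ScalarType>
    ScalarType max_relative_error(boost::numeric::ublas::matrix<ScalarType> const & A,
                                  boost::numeric::ublas::matrix<ScalarType> const & B)
    {
      ScalarType ret = 0;
      for (std::size_t i = 0; i < A.size1(); ++i)
        for (std::size_t j = 0; j < A.size2(); ++j)
        {
          ScalarType act = std::fabs(A(i,j) - B(i,j))
                           / std::max(std::fabs(A(i,j)), std::fabs(B(i,j)));
          if (act > ret)
            ret = act;
        }
      return ret;
    }
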
diff --cc tests/src/external_1.cpp
index 328c15c,68504ff..1c52375
--- a/tests/src/external_1.cpp
+++ b/tests/src/external_1.cpp
@@@ -38,10 -38,12 +38,19 @@@
  #include "viennacl/coordinate_matrix.hpp"
  #include "viennacl/ell_matrix.hpp"
  #include "viennacl/hyb_matrix.hpp"
++<<<<<<< HEAD
 +#include "viennacl/circulant_matrix.hpp"
 +#include "viennacl/hankel_matrix.hpp"
 +#include "viennacl/toeplitz_matrix.hpp"
 +#include "viennacl/vandermonde_matrix.hpp"
++=======
+ #ifdef VIENNACL_WITH_OPENCL
+   #include "viennacl/circulant_matrix.hpp"
+   #include "viennacl/hankel_matrix.hpp"
+   #include "viennacl/toeplitz_matrix.hpp"
+   #include "viennacl/vandermonde_matrix.hpp"
+ #endif
++>>>>>>> upstream/1.5.1
  
  #include "viennacl/linalg/ilu.hpp"
  #include "viennacl/linalg/row_scaling.hpp"
@@@ -53,16 -53,21 +60,24 @@@
  #include "viennacl/linalg/gmres.hpp"
  #include "viennacl/linalg/direct_solve.hpp"
  #include "viennacl/linalg/qr.hpp"
 +#include "viennacl/linalg/svd.hpp"
  
- #include "viennacl/fft.hpp"
  #include "viennacl/misc/bandwidth_reduction.hpp"
  
- #include "viennacl/io/kernel_parameters.hpp"
+ #ifdef VIENNACL_WITH_OPENCL
+   #include "viennacl/linalg/amg.hpp"
+   #include "viennacl/linalg/spai.hpp"
+   #include "viennacl/linalg/svd.hpp"
+   #include "viennacl/fft.hpp"
+   #include "viennacl/generator/generate.hpp"
+ #endif
+ 
  #include "viennacl/io/matrix_market.hpp"
+ #include "viennacl/scheduler/execute.hpp"
+ 
  
 +#include "viennacl/generator/custom_operation.hpp"
 +
  
  //defined in external_2.cpp
  void other_func();
diff --cc tests/src/external_2.cpp
index 62e5739,bc3c34f..67dc830
--- a/tests/src/external_2.cpp
+++ b/tests/src/external_2.cpp
@@@ -35,10 -38,12 +38,19 @@@
  #include "viennacl/coordinate_matrix.hpp"
  #include "viennacl/ell_matrix.hpp"
  #include "viennacl/hyb_matrix.hpp"
++<<<<<<< HEAD
 +#include "viennacl/circulant_matrix.hpp"
 +#include "viennacl/hankel_matrix.hpp"
 +#include "viennacl/toeplitz_matrix.hpp"
 +#include "viennacl/vandermonde_matrix.hpp"
++=======
+ #ifdef VIENNACL_WITH_OPENCL
+   #include "viennacl/circulant_matrix.hpp"
+   #include "viennacl/hankel_matrix.hpp"
+   #include "viennacl/toeplitz_matrix.hpp"
+   #include "viennacl/vandermonde_matrix.hpp"
+ #endif
++>>>>>>> upstream/1.5.1
  
  #include "viennacl/linalg/ilu.hpp"
  #include "viennacl/linalg/row_scaling.hpp"
@@@ -48,16 -53,20 +60,26 @@@
  #include "viennacl/linalg/gmres.hpp"
  #include "viennacl/linalg/direct_solve.hpp"
  #include "viennacl/linalg/qr.hpp"
++<<<<<<< HEAD
 +#include "viennacl/linalg/svd.hpp"
++=======
++>>>>>>> upstream/1.5.1
  
- #include "viennacl/fft.hpp"
  #include "viennacl/misc/bandwidth_reduction.hpp"
  
- #include "viennacl/io/kernel_parameters.hpp"
+ #ifdef VIENNACL_WITH_OPENCL
+   #include "viennacl/linalg/amg.hpp"
+   #include "viennacl/linalg/spai.hpp"
+   #include "viennacl/linalg/svd.hpp"
+   #include "viennacl/fft.hpp"
+   #include "viennacl/generator/generate.hpp"
+ #endif
+ 
  #include "viennacl/io/matrix_market.hpp"
+ #include "viennacl/scheduler/execute.hpp"
  
 +#include "viennacl/generator/custom_operation.hpp"
 +
  void other_func()
  {
    typedef float   NumericType;
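
external_1.cpp and external_2.cpp deliberately include the same set of ViennaCL headers in two translation units and link them into one binary; the test passes if the header-only library produces no multiple-definition errors. The pattern in miniature (hypothetical file names, one representative header):

    // tu_a.cpp
    #include "viennacl/vector.hpp"
    void other_func();                        // defined in tu_b.cpp
    int main() { other_func(); return 0; }

    // tu_b.cpp -- includes the very same header again:
    #include "viennacl/vector.hpp"
    void other_func()
    {
      viennacl::vector<float> v(10);          // touch a symbol from the header
      (void)v;
    }
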
diff --cc tests/src/nmf.cpp
index 7e68f26,5be2b4d..30aae67
--- a/tests/src/nmf.cpp
+++ b/tests/src/nmf.cpp
@@@ -1,17 -1,33 +1,50 @@@
++<<<<<<< HEAD
 +#include <ctime>
 +#include <cmath>
 +
 +#include "viennacl/linalg/prod.hpp"
 +#include "viennacl/linalg/nmf.hpp"
 +
 +#include "examples/benchmarks/benchmark-utils.hpp"
 +
 +typedef float ScalarType;
 +
 +const ScalarType EPS = 0.1;
 +
 +float matrix_compare(viennacl::matrix<ScalarType>& res,
 +                     viennacl::matrix<ScalarType>& ref) 
++=======
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ #include <ctime>
+ #include <cmath>
+ 
+ 
+ #include "viennacl/linalg/prod.hpp"
+ #include "viennacl/linalg/nmf.hpp"
+ 
+ typedef float ScalarType;
+ 
+ const ScalarType EPS = ScalarType(0.1);
+ 
+ float matrix_compare(viennacl::matrix<ScalarType>& res,
+                      viennacl::matrix<ScalarType>& ref)
++>>>>>>> upstream/1.5.1
  {
      std::vector<ScalarType> res_std(res.internal_size());
      std::vector<ScalarType> ref_std(ref.internal_size());
@@@ -30,60 -46,75 +63,130 @@@
      return diff / mx;
  }
  
++<<<<<<< HEAD
 +void fill_random(std::vector<ScalarType>& v)
 +{
 +    for(std::size_t j = 0; j < v.size(); j++)
 +        v[j] = static_cast<ScalarType>(rand()) / RAND_MAX;
 +}
 +
 +void test_nmf(std::size_t m, std::size_t k, std::size_t n)
 +{
 +    std::vector<ScalarType> stl_w(m * k);
 +    std::vector<ScalarType> stl_h(k * n);
++=======
+ 
+ void fill_random(std::vector< std::vector<ScalarType> >& v)
+ {
+     for(std::size_t i = 0; i < v.size(); i++)
+     {
+       for (std::size_t j = 0; j < v[i].size(); ++j)
+         v[i][j] = static_cast<ScalarType>(rand()) / RAND_MAX;
+     }
+ }
+ 
+ 
+ void test_nmf(std::size_t m, std::size_t k, std::size_t n)
+ {
+     std::vector< std::vector<ScalarType> > stl_w(m, std::vector<ScalarType>(k));
+     std::vector< std::vector<ScalarType> > stl_h(k, std::vector<ScalarType>(n));
++>>>>>>> upstream/1.5.1
  
      viennacl::matrix<ScalarType> v_ref(m, n);
      viennacl::matrix<ScalarType> w_ref(m, k);
      viennacl::matrix<ScalarType> h_ref(k, n);
  
++<<<<<<< HEAD
 +    viennacl::matrix<ScalarType> v_nmf(m, n);
 +    viennacl::matrix<ScalarType> w_nmf(m, k);
 +    viennacl::matrix<ScalarType> h_nmf(k, n);
 +
 +    fill_random(stl_w);
 +    fill_random(stl_h);
 +
 +    viennacl::fast_copy(&stl_w[0], &stl_w[0] + stl_w.size(), w_ref);
 +    viennacl::fast_copy(&stl_h[0], &stl_h[0] + stl_h.size(), h_ref);
 +
 +    v_ref = viennacl::linalg::prod(w_ref, h_ref);
 +
 +    viennacl::ocl::get_queue().finish();
 +
 +    //Timer timer;
 +    //timer.start();
 +
 +    viennacl::linalg::nmf(v_ref, w_nmf, h_nmf, k);
 +    viennacl::ocl::get_queue().finish();
 +
 +    //double time_spent = timer.get();
 +
 +    v_nmf = viennacl::linalg::prod(w_nmf, h_nmf);
++=======
+     fill_random(stl_w);
+     fill_random(stl_h);
+ 
+     viennacl::copy(stl_w, w_ref);
+     viennacl::copy(stl_h, h_ref);
+ 
+     v_ref = viennacl::linalg::prod(w_ref, h_ref);  //reference
+ 
+     // Fill again with random numbers:
+     fill_random(stl_w);
+     fill_random(stl_h);
+ 
+     viennacl::matrix<ScalarType> w_nmf(m, k);
+     viennacl::matrix<ScalarType> h_nmf(k, n);
+ 
+     viennacl::copy(stl_w, w_nmf);
+     viennacl::copy(stl_h, h_nmf);
+ 
+     viennacl::linalg::nmf_config conf;
+     conf.print_relative_error(true);
+     conf.max_iterations(5000); //5000 iterations are enough for the test
+     viennacl::linalg::nmf(v_ref, w_nmf, h_nmf, conf);
+ 
+     viennacl::matrix<ScalarType> v_nmf = viennacl::linalg::prod(w_nmf, h_nmf);
++>>>>>>> upstream/1.5.1
  
      float diff  = matrix_compare(v_ref, v_nmf);
      bool diff_ok = fabs(diff) < EPS;
  
++<<<<<<< HEAD
 +    printf("%6s [%lux%lux%lu] diff = %.6f\n", diff_ok?"[[OK]]":"[FAIL]", m, k, n, diff);
++=======
+     long iterations = static_cast<long>(conf.iters());
+     printf("%6s [%lux%lux%lu] diff = %.6f (%ld iterations)\n", diff_ok ? "[[OK]]":"[FAIL]", m, k, n, diff, iterations);
+ 
+     if (!diff_ok)
+       exit(EXIT_FAILURE);
++>>>>>>> upstream/1.5.1
  }
  
  int main()
  {
++<<<<<<< HEAD
 +    srand(time(NULL));
 +
 +    test_nmf(3, 3, 3);
 +    test_nmf(3, 2, 3);
 +    test_nmf(16, 7, 12);
 +    test_nmf(160, 73, 200);
 +    test_nmf(1000, 15, 1000);
 +
 +    return 0;
++=======
+   //srand(time(NULL));  //let's use deterministic tests, so keep the default srand() initialization
+ 
+   test_nmf(3, 3, 3);
+   test_nmf(3, 2, 3);
+   test_nmf(16, 7, 12);
+   test_nmf(140, 73, 180);
+   test_nmf(427, 21, 523);
+ 
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+ 
+ 
+   return EXIT_SUCCESS;
++>>>>>>> upstream/1.5.1
  }
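
The upstream side of this hunk replaces the old nmf(v, w, h, k) call with a configuration object. Condensed, the new call sequence factoring a given V into nonnegative factors W * H looks as follows (a sketch using only the calls shown in the test above):

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/nmf.hpp"

    void nmf_sketch(viennacl::matrix<float> const & V,
                    viennacl::matrix<float>       & W,   // m x k, pre-filled with nonnegative values
                    viennacl::matrix<float>       & H)   // k x n, likewise
    {
      viennacl::linalg::nmf_config conf;
      conf.max_iterations(5000);           // iteration cap used by the test
      viennacl::linalg::nmf(V, W, H, conf);
    }
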
diff --cc tests/src/sparse.cpp
index 8ccca8d,4d07000..efe956c
--- a/tests/src/sparse.cpp
+++ b/tests/src/sparse.cpp
@@@ -221,142 -386,442 +386,539 @@@ int resize_test(Epsilon const& epsilon
  template< typename NumericT, typename Epsilon >
  int test(Epsilon const& epsilon)
  {
-    std::cout << "Testing resizing of compressed_matrix..." << std::endl;
-    int retval = resize_test<NumericT, viennacl::compressed_matrix<NumericT> >(epsilon);
-    std::cout << "Testing resizing of coordinate_matrix..." << std::endl;
-    if (retval != EXIT_FAILURE)
-      retval = resize_test<NumericT, viennacl::coordinate_matrix<NumericT> >(epsilon);
-    
-    // --------------------------------------------------------------------------            
-    ublas::vector<NumericT> rhs;
-    ublas::vector<NumericT> result;
-    ublas::compressed_matrix<NumericT> ublas_matrix;
- 
-     if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
+   std::cout << "Testing resizing of compressed_matrix..." << std::endl;
+   int retval = resize_test<NumericT, viennacl::compressed_matrix<NumericT> >(epsilon);
+   if (retval != EXIT_SUCCESS)
+     return retval;
+   std::cout << "Testing resizing of coordinate_matrix..." << std::endl;
+   //if (retval != EXIT_FAILURE)
+   //  retval = resize_test<NumericT, viennacl::coordinate_matrix<NumericT> >(epsilon);
+   //else
+   //  return retval;
+ 
+   // --------------------------------------------------------------------------
+   ublas::vector<NumericT> rhs;
+   ublas::vector<NumericT> result;
+   ublas::compressed_matrix<NumericT> ublas_matrix;
+ 
+   if (viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx") == EXIT_FAILURE)
+   {
+     std::cout << "Error reading Matrix file" << std::endl;
+     return EXIT_FAILURE;
+   }
+   //unsigned int cg_mat_size = cg_mat.size();
+   std::cout << "done reading matrix" << std::endl;
+ 
+ 
+   rhs.resize(ublas_matrix.size2());
+   for (std::size_t i=0; i<rhs.size(); ++i)
+   {
+     ublas_matrix(i,i) = NumericT(0.5);   // Get rid of round-off errors by making row-sums unequal to zero:
+     rhs[i] = NumericT(1) + random<NumericT>();
+   }
+ 
+   // add some random numbers to the double-compressed matrix:
+   ublas::compressed_matrix<NumericT> ublas_cc_matrix(ublas_matrix.size1(), ublas_matrix.size2());
+   ublas_cc_matrix(42,199) = NumericT(3.1415);
+   ublas_cc_matrix(31, 69) = NumericT(2.71);
+   ublas_cc_matrix(23, 32) = NumericT(6);
+   ublas_cc_matrix(177,57) = NumericT(4);
+   ublas_cc_matrix(21, 97) = NumericT(-4);
+   ublas_cc_matrix(92, 25) = NumericT(2);
+   ublas_cc_matrix(89, 62) = NumericT(11);
+   ublas_cc_matrix(1,   7) = NumericT(8);
+   ublas_cc_matrix(85, 41) = NumericT(13);
+   ublas_cc_matrix(66, 28) = NumericT(8);
+   ublas_cc_matrix(21, 74) = NumericT(-2);
+ 
+ 
+   result = rhs;
+ 
+ 
+   viennacl::vector<NumericT> vcl_rhs(rhs.size());
+   viennacl::vector<NumericT> vcl_result(result.size());
+   viennacl::vector<NumericT> vcl_result2(result.size());
+   viennacl::compressed_matrix<NumericT> vcl_compressed_matrix(rhs.size(), rhs.size());
+   viennacl::compressed_compressed_matrix<NumericT> vcl_compressed_compressed_matrix(rhs.size(), rhs.size());
+   viennacl::coordinate_matrix<NumericT> vcl_coordinate_matrix(rhs.size(), rhs.size());
+   viennacl::ell_matrix<NumericT> vcl_ell_matrix;
+   viennacl::hyb_matrix<NumericT> vcl_hyb_matrix;
+ 
+   viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+   viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+   viennacl::copy(ublas_cc_matrix, vcl_compressed_compressed_matrix);
+   viennacl::copy(ublas_matrix, vcl_coordinate_matrix);
+ 
+   // --------------------------------------------------------------------------
+   std::cout << "Testing products: ublas" << std::endl;
+   result     = viennacl::linalg::prod(ublas_matrix, rhs);
+ 
+   std::cout << "Testing products: compressed_matrix" << std::endl;
+   vcl_result = viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs);
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-vector product with compressed_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+   std::cout << "Testing products: compressed_matrix, strided vectors" << std::endl;
+   retval = strided_matrix_vector_product_test<NumericT, viennacl::compressed_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+   if (retval != EXIT_SUCCESS)
+     return retval;
+ 
+   //
+   // Triangular solvers for A \ b:
+   //
+   ublas::compressed_matrix<NumericT> ublas_matrix_trans(ublas_matrix.size2(), ublas_matrix.size1(), ublas_matrix.nnz()); // = trans(ublas_matrix); //note: triangular solvers with uBLAS show atrocious performance, while transposed solvers are quite okay. To keep execution times short, we use a double-transpose-trick in the following.
+ 
+   // fast transpose:
+   for (typename ublas::compressed_matrix<NumericT>::iterator1 row_it  = ublas_matrix.begin1();
+                                                               row_it != ublas_matrix.end1();
+                                                             ++row_it)
+   {
+     for (typename ublas::compressed_matrix<NumericT>::iterator2 col_it  = row_it.begin();
+                                                                 col_it != row_it.end();
+                                                               ++col_it)
      {
-       std::cout << "Error reading Matrix file" << std::endl;
-       return EXIT_FAILURE;
+       ublas_matrix_trans(col_it.index1(), col_it.index2()) = *col_it;
      }
-     //unsigned int cg_mat_size = cg_mat.size(); 
-     std::cout << "done reading matrix" << std::endl;
- 
-     if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", rhs))
-     {
-       std::cout << "Error reading RHS file" << std::endl;
-       return EXIT_FAILURE;
-     }
-     std::cout << "done reading rhs" << std::endl;
- 
-     if (!readVectorFromFile("../../examples/testdata/result65025.txt", result))
+   }
+ 
+ 
+   std::cout << "Testing unit upper triangular solve: compressed_matrix" << std::endl;
+   result = rhs;
+   viennacl::copy(result, vcl_result);
+   boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::unit_upper_tag());
+   viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::unit_upper_tag());
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: unit upper triangular solve with compressed_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+   std::cout << "Testing upper triangular solve: compressed_matrix" << std::endl;
+   result = rhs;
+   viennacl::copy(result, vcl_result);
+   boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::upper_tag());
+   viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::upper_tag());
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: upper triangular solve with compressed_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+   std::cout << "Testing unit lower triangular solve: compressed_matrix" << std::endl;
+   result = rhs;
+   viennacl::copy(result, vcl_result);
+   boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::unit_lower_tag());
+   viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::unit_lower_tag());
+ 
+   /*std::list< viennacl::backend::mem_handle > multifrontal_L_row_index_arrays_;
+   std::list< viennacl::backend::mem_handle > multifrontal_L_row_buffers_;
+   std::list< viennacl::backend::mem_handle > multifrontal_L_col_buffers_;
+   std::list< viennacl::backend::mem_handle > multifrontal_L_element_buffers_;
+   std::list< std::size_t > multifrontal_L_row_elimination_num_list_;
+ 
+   viennacl::vector<NumericT> multifrontal_U_diagonal_;
+ 
+   viennacl::switch_memory_domain(multifrontal_U_diagonal_, viennacl::MAIN_MEMORY);
+   multifrontal_U_diagonal_.resize(vcl_compressed_matrix.size1(), false);
+   viennacl::linalg::single_threaded::detail::row_info(vcl_compressed_matrix, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+ 
+   viennacl::linalg::detail::multifrontal_setup_L(vcl_compressed_matrix,
+                                                   multifrontal_U_diagonal_, //dummy
+                                                   multifrontal_L_row_index_arrays_,
+                                                   multifrontal_L_row_buffers_,
+                                                   multifrontal_L_col_buffers_,
+                                                   multifrontal_L_element_buffers_,
+                                                   multifrontal_L_row_elimination_num_list_);
+ 
+   viennacl::linalg::detail::multifrontal_substitute(vcl_result,
+                                                     multifrontal_L_row_index_arrays_,
+                                                     multifrontal_L_row_buffers_,
+                                                     multifrontal_L_col_buffers_,
+                                                     multifrontal_L_element_buffers_,
+                                                     multifrontal_L_row_elimination_num_list_);*/
+ 
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: unit lower triangular solve with compressed_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+ 
+   std::cout << "Testing lower triangular solve: compressed_matrix" << std::endl;
+   result = rhs;
+   viennacl::copy(result, vcl_result);
+   boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::lower_tag());
+   viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::lower_tag());
+ 
+   /*std::list< viennacl::backend::mem_handle > multifrontal_U_row_index_arrays_;
+   std::list< viennacl::backend::mem_handle > multifrontal_U_row_buffers_;
+   std::list< viennacl::backend::mem_handle > multifrontal_U_col_buffers_;
+   std::list< viennacl::backend::mem_handle > multifrontal_U_element_buffers_;
+   std::list< std::size_t > multifrontal_U_row_elimination_num_list_;
+ 
+   multifrontal_U_diagonal_.resize(vcl_compressed_matrix.size1(), false);
+   viennacl::linalg::single_threaded::detail::row_info(vcl_compressed_matrix, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+   viennacl::linalg::detail::multifrontal_setup_U(vcl_compressed_matrix,
+                                                  multifrontal_U_diagonal_,
+                                                  multifrontal_U_row_index_arrays_,
+                                                  multifrontal_U_row_buffers_,
+                                                  multifrontal_U_col_buffers_,
+                                                  multifrontal_U_element_buffers_,
+                                                  multifrontal_U_row_elimination_num_list_);
+ 
+   vcl_result = viennacl::linalg::element_div(vcl_result, multifrontal_U_diagonal_);
+   viennacl::linalg::detail::multifrontal_substitute(vcl_result,
+                                                     multifrontal_U_row_index_arrays_,
+                                                     multifrontal_U_row_buffers_,
+                                                     multifrontal_U_col_buffers_,
+                                                     multifrontal_U_element_buffers_,
+                                                     multifrontal_U_row_elimination_num_list_);*/
+ 
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: lower triangular solve with compressed_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+ /*
+   std::cout << "Testing lower triangular solve: compressed_matrix" << std::endl;
+   result = rhs;
+   viennacl::copy(result, vcl_result);
+   boost::numeric::ublas::inplace_solve(ublas_matrix, result, boost::numeric::ublas::lower_tag());
+   viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::lower_tag());
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: lower triangular solve with compressed_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }*/
+ 
+   //
+   // Triangular solvers for A^T \ b
+   //
+ 
+   std::cout << "Testing transposed unit upper triangular solve: compressed_matrix" << std::endl;
+   result = rhs;
+   viennacl::copy(result, vcl_result);
+   boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::unit_upper_tag());
+   viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::unit_upper_tag());
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: unit upper triangular solve with compressed_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+   std::cout << "Testing transposed upper triangular solve: compressed_matrix" << std::endl;
+   result = rhs;
+   viennacl::copy(result, vcl_result);
+   boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::upper_tag());
+   viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::upper_tag());
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: upper triangular solve with compressed_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+ 
+   std::cout << "Testing transposed unit lower triangular solve: compressed_matrix" << std::endl;
+   result = rhs;
+   viennacl::copy(result, vcl_result);
+   boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::unit_lower_tag());
+   viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::unit_lower_tag());
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: unit lower triangular solve with compressed_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+   std::cout << "Testing transposed lower triangular solve: compressed_matrix" << std::endl;
+   result = rhs;
+   viennacl::copy(result, vcl_result);
+   boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::lower_tag());
+   viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::lower_tag());
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: lower triangular solve with compressed_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
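+   // For reference, the solver interface exercised above reduces to the
+   // following minimal sketch (a hedged illustration, not part of this test;
+   // includes as at the top of this file, setup of A and b omitted). The tag
+   // selects the triangular part, and b is overwritten with the solution:
+   //
+   //   void solve_examples(viennacl::compressed_matrix<float> const & A,
+   //                       viennacl::vector<float> & b)
+   //   {
+   //     viennacl::linalg::inplace_solve(A, b, viennacl::linalg::lower_tag());        // L x = b
+   //     viennacl::linalg::inplace_solve(A, b, viennacl::linalg::unit_lower_tag());   // unit diagonal assumed
+   //     viennacl::linalg::inplace_solve(trans(A), b, viennacl::linalg::upper_tag()); // L^T x = b, cf. the transposed tests above
+   //   }
+ 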
+ 
+   std::cout << "Testing products: compressed_compressed_matrix" << std::endl;
+   result     = viennacl::linalg::prod(ublas_cc_matrix, rhs);
+   vcl_result = viennacl::linalg::prod(vcl_compressed_compressed_matrix, vcl_rhs);
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-vector product with compressed_compressed_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+   {
+     ublas::compressed_matrix<NumericT> temp(vcl_compressed_compressed_matrix.size1(), vcl_compressed_compressed_matrix.size2());
+     viennacl::copy(vcl_compressed_compressed_matrix, temp);
+ 
+     // check that entries are correct by computing the product again:
+     result     = viennacl::linalg::prod(temp, rhs);
+ 
+     if( std::fabs(diff(result, vcl_result)) > epsilon )
      {
++<<<<<<< HEAD
 +      std::cout << "Error reading Result file" << std::endl;
 +      return EXIT_FAILURE;
 +    }
 +    std::cout << "done reading result" << std::endl;
 +   
 +
 +   viennacl::vector<NumericT> vcl_rhs(rhs.size());
 +   viennacl::vector<NumericT> vcl_result(result.size()); 
 +   viennacl::vector<NumericT> vcl_result2(result.size()); 
 +   viennacl::compressed_matrix<NumericT> vcl_compressed_matrix(rhs.size(), rhs.size());
 +   viennacl::coordinate_matrix<NumericT> vcl_coordinate_matrix(rhs.size(), rhs.size());
 +   viennacl::ell_matrix<NumericT> vcl_ell_matrix(rhs.size(), rhs.size());
 +   viennacl::hyb_matrix<NumericT> vcl_hyb_matrix(rhs.size(), rhs.size());
 +
 +   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
 +   copy(ublas_matrix, vcl_compressed_matrix);
 +   copy(ublas_matrix, vcl_coordinate_matrix);
 +
 +   // --------------------------------------------------------------------------          
 +   std::cout << "Testing products: ublas" << std::endl;
 +   result     = viennacl::linalg::prod(ublas_matrix, rhs);
 +   
 +   std::cout << "Testing products: compressed_matrix" << std::endl;
 +   vcl_result = viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs);
 +   
 +   if( fabs(diff(result, vcl_result)) > epsilon )
 +   {
 +      std::cout << "# Error at operation: matrix-vector product with compressed_matrix" << std::endl;
 +      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
 +      retval = EXIT_FAILURE;
 +   }
 +   
 +   std::cout << "Copying ell_matrix" << std::endl;
 +   copy(ublas_matrix, vcl_ell_matrix);
 +   ublas_matrix.clear();
 +   copy(vcl_ell_matrix, ublas_matrix); // just to check that it works
 +
 +
 +   std::cout << "Testing products: ell_matrix" << std::endl;
 +   vcl_result.clear();
 +   vcl_result = viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs);
 +   //viennacl::linalg::prod_impl(vcl_ell_matrix, vcl_rhs, vcl_result);
 +   //std::cout << vcl_result << "\n";
 +   std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
 +   std::cout << "First entry of result vector: " << vcl_result[0] << std::endl;
 +   
 +   if( fabs(diff(result, vcl_result)) > epsilon )
 +   {
 +      std::cout << "# Error at operation: matrix-vector product with ell_matrix" << std::endl;
 +      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
 +      retval = EXIT_FAILURE;
 +   }
 +   
 +   
 +   std::cout << "Copying hyb_matrix" << std::endl;
 +   copy(ublas_matrix, vcl_hyb_matrix);
 +   ublas_matrix.clear();
 +   copy(vcl_hyb_matrix, ublas_matrix); // just to check that it works
 +   copy(ublas_matrix, vcl_hyb_matrix);
 + 
 +   std::cout << "Testing products: hyb_matrix" << std::endl;
 +   vcl_result.clear();
 +   vcl_result = viennacl::linalg::prod(vcl_hyb_matrix, vcl_rhs);
 +   //viennacl::linalg::prod_impl(vcl_hyb_matrix, vcl_rhs, vcl_result);
 +   //std::cout << vcl_result << "\n";
 +   std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
 +   std::cout << "First entry of result vector: " << vcl_result[0] << std::endl;
 +   
 +   if( fabs(diff(result, vcl_result)) > epsilon )
 +   {
 +      std::cout << "# Error at operation: matrix-vector product with hyb_matrix" << std::endl;
 +      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
 +      retval = EXIT_FAILURE;
 +   }
 +
 +   
 +   // --------------------------------------------------------------------------            
 +   // --------------------------------------------------------------------------            
 +   NumericT alpha = static_cast<NumericT>(2.786);
 +   NumericT beta = static_cast<NumericT>(1.432);
 +   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
 +   copy(result.begin(), result.end(), vcl_result.begin());
 +   copy(result.begin(), result.end(), vcl_result2.begin());
 +
 +   std::cout << "Testing scaled additions of products and vectors" << std::endl;
 +   result     = alpha * viennacl::linalg::prod(ublas_matrix, rhs) + beta * result;
 +   vcl_result2 = alpha * viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs) + beta * vcl_result;
 +
 +   if( fabs(diff(result, vcl_result2)) > epsilon )
 +   {
 +      std::cout << "# Error at operation: matrix-vector product (compressed_matrix) with scaled additions" << std::endl;
 +      std::cout << "  diff: " << fabs(diff(result, vcl_result2)) << std::endl;
 +      retval = EXIT_FAILURE;
 +   }
- 
-    
- /*   vcl_result2 = alpha * viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs) + beta * vcl_result;
- 
-    if( fabs(diff(result, vcl_result2)) > epsilon )
-    {
-       std::cout << "# Error at operation: matrix-vector product (coordinate_matrix) with scaled additions" << std::endl;
-       std::cout << "  diff: " << fabs(diff(result, vcl_result2)) << std::endl;
++=======
+       std::cout << "# Error at operation: matrix-vector product with compressed_compressed_matrix (after copy back)" << std::endl;
+       std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
        retval = EXIT_FAILURE;
-    }*/
- 
-    
-    // --------------------------------------------------------------------------            
-    return retval;
+     }
++>>>>>>> upstream/1.5.1
+ 
+   }
+ 
+ 
+ 
+ 
+   std::cout << "Testing products: coordinate_matrix" << std::endl;
+   result     = viennacl::linalg::prod(ublas_matrix, rhs);
+   vcl_result = viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs);
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-vector product with coordinate_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+   std::cout << "Testing products: coordinate_matrix, strided vectors" << std::endl;
+   //std::cout << " --> SKIPPING <--" << std::endl;
+   retval = strided_matrix_vector_product_test<NumericT, viennacl::coordinate_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+   if (retval != EXIT_SUCCESS)
+     return retval;
+ 
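+   // strided_matrix_vector_product_test is defined earlier in this file
+   // (outside this hunk); the ell_matrix and hyb_matrix variants below reuse it.
+   // In essence it repeats the product check on non-contiguous data, along the
+   // lines of this hedged sketch (slice choice assumed, comparison against the
+   // uBLAS reference elided):
+   //
+   //   viennacl::vector<NumericT> big_rhs(2 * vcl_rhs.size());
+   //   viennacl::vector<NumericT> big_result(2 * vcl_rhs.size());
+   //   viennacl::slice s(0, 2, vcl_rhs.size());   // start 0, stride 2
+   //   viennacl::vector_slice<viennacl::vector<NumericT> > rhs_slice(big_rhs, s);
+   //   viennacl::vector_slice<viennacl::vector<NumericT> > result_slice(big_result, s);
+   //   rhs_slice = vcl_rhs;
+   //   result_slice = viennacl::linalg::prod(vcl_coordinate_matrix, rhs_slice);
+ 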
+ 
+   //std::cout << "Copying ell_matrix" << std::endl;
+   viennacl::copy(ublas_matrix, vcl_ell_matrix);
+   ublas_matrix.clear();
+   viennacl::copy(vcl_ell_matrix, ublas_matrix); // just to check that it works
+ 
+ 
+   std::cout << "Testing products: ell_matrix" << std::endl;
+   result     = viennacl::linalg::prod(ublas_matrix, rhs);
+   vcl_result.clear();
+   vcl_result = viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs);
+   //viennacl::linalg::prod_impl(vcl_ell_matrix, vcl_rhs, vcl_result);
+   //std::cout << vcl_result << "\n";
+   //std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+   //std::cout << "First entry of result vector: " << vcl_result[0] << std::endl;
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-vector product with ell_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+   std::cout << "Testing products: ell_matrix, strided vectors" << std::endl;
+   retval = strided_matrix_vector_product_test<NumericT, viennacl::ell_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+   if (retval != EXIT_SUCCESS)
+     return retval;
+ 
+ 
+   //std::cout << "Copying hyb_matrix" << std::endl;
+   viennacl::copy(ublas_matrix, vcl_hyb_matrix);
+   ublas_matrix.clear();
+   viennacl::copy(vcl_hyb_matrix, ublas_matrix); // just to check that it works
+   viennacl::copy(ublas_matrix, vcl_hyb_matrix);
+ 
+   std::cout << "Testing products: hyb_matrix" << std::endl;
+   result     = viennacl::linalg::prod(ublas_matrix, rhs);
+   vcl_result.clear();
+   vcl_result = viennacl::linalg::prod(vcl_hyb_matrix, vcl_rhs);
+   //viennacl::linalg::prod_impl(vcl_hyb_matrix, vcl_rhs, vcl_result);
+   //std::cout << vcl_result << "\n";
+   //std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+   //std::cout << "First entry of result vector: " << vcl_result[0] << std::endl;
+ 
+   if( std::fabs(diff(result, vcl_result)) > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-vector product with hyb_matrix" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+   std::cout << "Testing products: hyb_matrix, strided vectors" << std::endl;
+   retval = strided_matrix_vector_product_test<NumericT, viennacl::hyb_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+   if (retval != EXIT_SUCCESS)
+     return retval;
+ 
+ 
+   // --------------------------------------------------------------------------
+   // --------------------------------------------------------------------------
+   NumericT alpha = static_cast<NumericT>(2.786);
+   NumericT beta = static_cast<NumericT>(1.432);
+   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+   copy(result.begin(), result.end(), vcl_result.begin());
+   copy(result.begin(), result.end(), vcl_result2.begin());
+ 
+   std::cout << "Testing scaled additions of products and vectors" << std::endl;
+   result     = alpha * viennacl::linalg::prod(ublas_matrix, rhs) + beta * result;
+   vcl_result2 = alpha * viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs) + beta * vcl_result;
+ 
+   if( std::fabs(diff(result, vcl_result2)) > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-vector product (compressed_matrix) with scaled additions" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+ 
+   vcl_result2.clear();
+   vcl_result2 = alpha * viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs) + beta * vcl_result;
+ 
+   if( std::fabs(diff(result, vcl_result2)) > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-vector product (coordinate_matrix) with scaled additions" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+   vcl_result2.clear();
+   vcl_result2 = alpha * viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs) + beta * vcl_result;
+ 
+   if( std::fabs(diff(result, vcl_result2)) > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-vector product (ell_matrix) with scaled additions" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
+   vcl_result2.clear();
+   vcl_result2 = alpha * viennacl::linalg::prod(vcl_hyb_matrix, vcl_rhs) + beta * vcl_result;
+ 
+   if( std::fabs(diff(result, vcl_result2)) > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-vector product (hyb_matrix) with scaled additions" << std::endl;
+     std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+     retval = EXIT_FAILURE;
+   }
+ 
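+   // diff(ublas_v, vcl_v) above is the file-local relative-difference helper
+   // (defined near the top of this test, outside this hunk). A typical
+   // implementation, sketched here for reference only:
+   //
+   //   template <typename UblasVectorT, typename NumericT>
+   //   NumericT diff(UblasVectorT const & ref, viennacl::vector<NumericT> const & vcl_vec)
+   //   {
+   //     std::vector<NumericT> host(vcl_vec.size());
+   //     viennacl::copy(vcl_vec.begin(), vcl_vec.end(), host.begin());
+   //     NumericT d = 0;
+   //     for (std::size_t i = 0; i < host.size(); ++i)
+   //     {
+   //       NumericT denom = std::max(std::fabs(ref[i]), std::fabs(host[i]));
+   //       if (denom > 0)
+   //         d = std::max(d, std::fabs(ref[i] - host[i]) / denom);
+   //     }
+   //     return d;
+   //   }
+ 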
+ 
+   // --------------------------------------------------------------------------
+   return retval;
  }
  //
  // -------------------------------------------------------------
diff --cc tests/src/svd.cpp
index 88bfcd8,81775e6..10ac144
--- a/tests/src/svd.cpp
+++ b/tests/src/svd.cpp
@@@ -1,3 -1,20 +1,23 @@@
++<<<<<<< HEAD
++=======
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
++>>>>>>> upstream/1.5.1
  #include <stdexcept>
  #include <iostream>
  #include <string>
@@@ -11,107 -28,123 +31,226 @@@
  
  #include "examples/benchmarks/benchmark-utils.hpp"
  
++<<<<<<< HEAD
 +typedef float ScalarType;
 +
 +const float EPS = 0.001;
 +
 +void read_matrix_size(std::fstream& f, unsigned int& sz1, unsigned int& sz2) {
 +    if(!f.is_open())
 +        throw std::invalid_argument("File is not open");
 +
 +    f >> sz1 >> sz2;
 +}
 +
 +void read_matrix_body(std::fstream& f, viennacl::matrix<ScalarType>& A) {
 +    if(!f.is_open())
 +        throw std::invalid_argument("File is not open");
 +
 +    boost::numeric::ublas::matrix<float> h_A(A.size1(), A.size2());
 +
 +    for(unsigned int i = 0; i < h_A.size1(); i++) {
 +        for(unsigned int j = 0; j < h_A.size2(); j++) {
 +            ScalarType val = 0.0;
 +            f >> val;
 +            h_A(i, j) = val;
 +        }
 +    }
 +
 +    viennacl::copy(h_A, A);
 +}
 +
 +void read_vector_body(std::fstream& f, std::vector<ScalarType>& v) {
 +    if(!f.is_open())
 +        throw std::invalid_argument("File is not open");
 +
 +    for(unsigned int i = 0; i < v.size(); i++)
 +    {
 +            ScalarType val = 0.0;
 +            f >> val;
 +            v[i] = val;
 +    }
 +}
 +
 +void random_fill(std::vector<ScalarType>& in) {
 +    for(unsigned int i = 0; i < in.size(); i++) {
 +        in[i] = (float)rand() / RAND_MAX;
 +    }
 +}
 +
 +bool check_bidiag(viennacl::matrix<ScalarType>& A) {
 +    const float EPS = 0.0001f;
 +
 +    std::vector<ScalarType> aA(A.size1() * A.size2());
 +    viennacl::fast_copy(A, &aA[0]);
 +
 +    for(unsigned int i = 0; i < A.size1(); i++) {
 +        for(unsigned int j = 0; j < A.size2(); j++) {
 +            ScalarType val = aA[i * A.size2() + j];
 +            if((fabs(val) > EPS) && (i != j) && ((i + 1) != j)) {
 +                std::cout << "Failed at " << i << " " << j << " " << val << std::endl;
 +                return false;
 +            }
 +        }
 +    }
 +
 +    return true;
 +}
 +
 +float matrix_compare(viennacl::matrix<ScalarType>& res,
 +                     viennacl::matrix<ScalarType>& ref) 
 +{
 +    std::vector<ScalarType> res_std(res.internal_size());
 +    std::vector<ScalarType> ref_std(ref.internal_size());
 +
 +    viennacl::fast_copy(res, &res_std[0]);
 +    viennacl::fast_copy(ref, &ref_std[0]);
 +
 +    float diff = 0.0;
 +    float mx = 0.0;
 +
 +    for(unsigned int i = 0; i < res_std.size(); i++) {
 +        diff = std::max(diff, std::abs(res_std[i] - ref_std[i]));
 +        mx = std::max(mx, res_std[i]);
 +    }
 +
 +    return diff / mx;
 +}
 +
 +float sigmas_compare(viennacl::matrix<ScalarType>& res, 
 +                        std::vector<ScalarType>& ref) 
 +{
 +    std::vector<ScalarType> res_std(ref.size());
 +
 +    for(size_t i = 0; i < ref.size(); i++)
 +    {
 +        res_std[i] = res(i, i);
 +    }
++=======
+ 
+ void read_matrix_size(std::fstream& f, std::size_t & sz1, std::size_t & sz2)
+ {
+   if(!f.is_open())
+     throw std::invalid_argument("File is not open");
+ 
+   f >> sz1 >> sz2;
+ }
+ 
+ 
+ template <typename ScalarType>
+ void read_matrix_body(std::fstream& f, viennacl::matrix<ScalarType>& A)
+ {
+   if(!f.is_open())
+     throw std::invalid_argument("File is not open");
+ 
+   boost::numeric::ublas::matrix<ScalarType> h_A(A.size1(), A.size2());
+ 
+   for(std::size_t i = 0; i < h_A.size1(); i++)
+   {
+     for(std::size_t j = 0; j < h_A.size2(); j++)
+     {
+       ScalarType val = 0.0;
+       f >> val;
+       h_A(i, j) = val;
+     }
+   }
+ 
+   viennacl::copy(h_A, A);
+ }
+ 
+ 
+ template <typename ScalarType>
+ void read_vector_body(std::fstream& f, std::vector<ScalarType>& v)
+ {
+   if(!f.is_open())
+     throw std::invalid_argument("File is not open");
+ 
+   for(std::size_t i = 0; i < v.size(); i++)
+   {
+     ScalarType val = 0.0;
+     f >> val;
+     v[i] = val;
+   }
+ }
+ 
+ 
+ template <typename ScalarType>
+ void random_fill(std::vector<ScalarType>& in)
+ {
+   for(std::size_t i = 0; i < in.size(); i++)
+     in[i] = static_cast<ScalarType>(rand()) / RAND_MAX;
+ }
+ 
+ 
+ template <typename ScalarType>
+ bool check_bidiag(viennacl::matrix<ScalarType>& A)
+ {
+   const ScalarType EPS = 0.0001f;
+ 
+   std::vector<ScalarType> aA(A.size1() * A.size2());
+   viennacl::fast_copy(A, &aA[0]);
+ 
+   for(std::size_t i = 0; i < A.size1(); i++)
+   {
+     for(std::size_t j = 0; j < A.size2(); j++)
+     {
+       ScalarType val = aA[i * A.size2() + j];
+       if((fabs(val) > EPS) && (i != j) && ((i + 1) != j))
+       {
+         std::cout << "Failed at " << i << " " << j << " " << val << std::endl;
+         return false;
+       }
+     }
+   }
+ 
+   return true;
+ }
+ 
+ template <typename ScalarType>
+ ScalarType matrix_compare(viennacl::matrix<ScalarType>& res,
+                      viennacl::matrix<ScalarType>& ref)
+ {
+   std::vector<ScalarType> res_std(res.internal_size());
+   std::vector<ScalarType> ref_std(ref.internal_size());
+ 
+   viennacl::fast_copy(res, &res_std[0]);
+   viennacl::fast_copy(ref, &ref_std[0]);
+ 
+   ScalarType diff = 0.0;
+   ScalarType mx = 0.0;
+ 
+   for(std::size_t i = 0; i < res_std.size(); i++)
+   {
+     diff = std::max(diff, std::abs(res_std[i] - ref_std[i]));
+     mx = std::max(mx, res_std[i]);
+   }
+ 
+   return diff / mx;
+ }
+ 
+ 
+ template <typename ScalarType>
+ ScalarType sigmas_compare(viennacl::matrix<ScalarType>& res,
+                                std::vector<ScalarType>& ref)
+ {
+     std::vector<ScalarType> res_std(ref.size());
+ 
+     for(std::size_t i = 0; i < ref.size(); i++)
+         res_std[i] = res(i, i);
++>>>>>>> upstream/1.5.1
  
      std::sort(ref.begin(), ref.end());
      std::sort(res_std.begin(), res_std.end());
  
++<<<<<<< HEAD
 +    float diff = 0.0;
 +    float mx = 0.0;
 +    for(size_t i = 0; i < ref.size(); i++) 
++=======
+     ScalarType diff = 0.0;
+     ScalarType mx = 0.0;
+     for(std::size_t i = 0; i < ref.size(); i++)
++>>>>>>> upstream/1.5.1
      {
          diff = std::max(diff, std::abs(res_std[i] - ref[i]));
          mx = std::max(mx, res_std[i]);
@@@ -121,94 -154,158 +260,251 @@@
  }
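  
  // The readers above consume a plain-text layout: the two dimensions first,
  // then the sz1 x sz2 matrix entries row by row, then min(sz1, sz2) reference
  // singular values. A toy writer producing a compatible 2x2 file (file name
  // and values illustrative; the singular values of diag(3, 2) are 3 and 2):
  //
  //   #include <fstream>
  //   void write_toy_example()
  //   {
  //     std::ofstream f("toy.example");
  //     f << 2 << " " << 2 << "\n";          // dimensions
  //     f << 3 << " " << 0 << "\n"
  //       << 0 << " " << 2 << "\n";          // matrix body
  //     f << 3 << " " << 2 << "\n";          // reference singular values
  //   }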
  
  
++<<<<<<< HEAD
 +void test_svd(const std::string& fn) 
 +{
 +    unsigned int sz1, sz2;
 +
 +    //read matrix
 +
 +    // sz1 = 2048, sz2 = 2048;
 +    // std::vector<ScalarType> in(sz1 * sz2);
 +    // random_fill(in);
 +
 +    // read file
 +    std::fstream f(fn.c_str(), std::fstream::in);
 +    //read size of input matrix
 +    read_matrix_size(f, sz1, sz2);
 +
 +    unsigned int to = std::min(sz1, sz2);
 +
 +    viennacl::matrix<ScalarType> Ai(sz1, sz2), Aref(sz1, sz2), QL(sz1, sz1), QR(sz2, sz2);
 +    read_matrix_body(f, Ai);
 +
 +    std::vector<ScalarType> sigma_ref(to);
 +    read_vector_body(f, sigma_ref);
 +
 +    f.close();
 +
 +    // viennacl::fast_copy(&in[0], &in[0] + in.size(), Ai);
 +
 +    Aref = Ai;
 +
 +    Timer timer;
 +    timer.start();
 +
 +    viennacl::linalg::svd(Ai, QL, QR);
 +
 +    viennacl::ocl::get_queue().finish();
 +
 +    double time_spend = timer.get();
 +
 +    viennacl::matrix<ScalarType> result1(sz1, sz2), result2(sz1, sz2);
 +    result1 = viennacl::linalg::prod(QL, Ai);
 +    result2 = viennacl::linalg::prod(result1, trans(QR));
 +
 +    float sigma_diff = sigmas_compare(Ai, sigma_ref);
 +    float prods_diff  = matrix_compare(result2, Aref);
 +
 +    bool sigma_ok = (fabs(sigma_diff) < EPS) && (fabs(prods_diff) < EPS);
 +
 +    printf("%6s [%dx%d] %40s sigma_diff = %.6f; prod_diff = %.6f; time = %.6f\n", sigma_ok?"[[OK]]":"[FAIL]", (int)Aref.size1(), (int)Aref.size2(), fn.c_str(), sigma_diff, prods_diff, time_spend);
 +}
 +
 +
 +void time_svd(size_t sz1, size_t sz2) 
 +{
 +
 +    std::vector<ScalarType> in(sz1 * sz2);
 +    random_fill(in);
 +
 +    viennacl::matrix<ScalarType> Ai(sz1, sz2), QL(sz1, sz1), QR(sz2, sz2);
 +
 +    viennacl::fast_copy(&in[0], &in[0] + in.size(), Ai);
 +
 +
 +    Timer timer;
 +    timer.start();
 +
 +    viennacl::linalg::svd(Ai, QL, QR);
 +
 +    viennacl::ocl::get_queue().finish();
 +
 +    double time_spend = timer.get();
 +
 +    printf("[%dx%d] time = %.6f\n", (int)sz1, (int)sz2, time_spend);
 +}
 +
 +int main() 
 +{
 +
 +    test_svd(std::string("../../examples/testdata/svd/qr.example"));
 +    test_svd(std::string("../../examples/testdata/svd/wiki.example"));
 +    test_svd(std::string("../../examples/testdata/svd/wiki.qr.example"));
 +    test_svd(std::string("../../examples/testdata/svd/pysvd.example"));
 +    test_svd(std::string("../../examples/testdata/svd/random.example"));
 +
 +    time_svd(500, 500);
 +    time_svd(1000, 1000);
 +    time_svd(4096, 512);
 +    time_svd(2048, 2048);
++=======
+ template <typename ScalarType>
+ void test_svd(const std::string & fn, ScalarType EPS)
+ {
+   std::size_t sz1, sz2;
+ 
+   //read matrix
+ 
+   // sz1 = 2048, sz2 = 2048;
+   // std::vector<ScalarType> in(sz1 * sz2);
+   // random_fill(in);
+ 
+   // read file
+   std::fstream f(fn.c_str(), std::fstream::in);
+   //read size of input matrix
+   read_matrix_size(f, sz1, sz2);
+ 
+   std::size_t to = std::min(sz1, sz2);
+ 
+   viennacl::matrix<ScalarType> Ai(sz1, sz2), Aref(sz1, sz2), QL(sz1, sz1), QR(sz2, sz2);
+   read_matrix_body(f, Ai);
+ 
+   std::vector<ScalarType> sigma_ref(to);
+   read_vector_body(f, sigma_ref);
+ 
+   f.close();
+ 
+   // viennacl::fast_copy(&in[0], &in[0] + in.size(), Ai);
+ 
+   Aref = Ai;
+ 
+   Timer timer;
+   timer.start();
+ 
+   viennacl::linalg::svd(Ai, QL, QR);
+ 
+   viennacl::backend::finish();
+ 
+   double time_spend = timer.get();
+ 
+   viennacl::matrix<ScalarType> result1(sz1, sz2), result2(sz1, sz2);
+   result1 = viennacl::linalg::prod(QL, Ai);
+   result2 = viennacl::linalg::prod(result1, trans(QR));
+ 
+   ScalarType sigma_diff = sigmas_compare(Ai, sigma_ref);
+   ScalarType prods_diff  = matrix_compare(result2, Aref);
+ 
+   bool sigma_ok = (fabs(sigma_diff) < EPS)
+                    && (fabs(prods_diff) < std::sqrt(EPS));  //note: computing the product is not accurate down to 10^{-16}, so we allow for a higher tolerance here
+ 
+   printf("%6s [%dx%d] %40s sigma_diff = %.6f; prod_diff = %.6f; time = %.6f\n", sigma_ok?"[[OK]]":"[FAIL]", (int)Aref.size1(), (int)Aref.size2(), fn.c_str(), sigma_diff, prods_diff, time_spend);
+ }
+ 
+ 
+ template <typename ScalarType>
+ void time_svd(std::size_t sz1, std::size_t sz2)
+ {
+   viennacl::matrix<ScalarType> Ai(sz1, sz2), QL(sz1, sz1), QR(sz2, sz2);
+ 
+   std::vector<ScalarType> in(Ai.internal_size1() * Ai.internal_size2());
+   random_fill(in);
+ 
+   viennacl::fast_copy(&in[0], &in[0] + in.size(), Ai);
+ 
+ 
+   Timer timer;
+   timer.start();
+ 
+   viennacl::linalg::svd(Ai, QL, QR);
+   viennacl::backend::finish();
+   double time_spend = timer.get();
+ 
+   printf("[%dx%d] time = %.6f\n", static_cast<int>(sz1), static_cast<int>(sz2), time_spend);
+ }
+ 
+ 
+ template <typename ScalarType>
+ int test(ScalarType epsilon)
+ {
+ 
+     test_svd<ScalarType>(std::string("../../examples/testdata/svd/qr.example"), epsilon);
+     test_svd<ScalarType>(std::string("../../examples/testdata/svd/wiki.example"), epsilon);
+     test_svd<ScalarType>(std::string("../../examples/testdata/svd/wiki.qr.example"), epsilon);
+     test_svd<ScalarType>(std::string("../../examples/testdata/svd/pysvd.example"), epsilon);
+     test_svd<ScalarType>(std::string("../../examples/testdata/svd/random.example"), epsilon);
+ 
+     time_svd<ScalarType>(500, 500);
+     time_svd<ScalarType>(1000, 1000);
+     time_svd<ScalarType>(4096, 512);
+     time_svd<ScalarType>(2048, 2048);
++>>>>>>> upstream/1.5.1
      //time_svd(4096, 4096);  //takes too long for a standard sanity test. Feel free to uncomment
  
      return EXIT_SUCCESS;
  }
++<<<<<<< HEAD
++=======
+ 
+ //
+ // -------------------------------------------------------------
+ //
+ int main()
+ {
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << "## Test :: BLAS 3 routines" << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+ 
+    int retval = EXIT_SUCCESS;
+ 
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+    {
+       typedef float NumericT;
+       NumericT epsilon = NumericT(1.0E-4);
+       std::cout << "# Testing setup:" << std::endl;
+       std::cout << "  eps:     " << epsilon << std::endl;
+       std::cout << "  numeric: float" << std::endl;
+       retval = test<NumericT>(epsilon);
+       if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+       else
+         return retval;
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+    if( viennacl::ocl::current_device().double_support() )
+    {
+       {
+         typedef double NumericT;
+         NumericT epsilon = 1.0E-6;  //Note: higher accuracy is not possible, because the test data is only available in single (float) precision
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  eps:     " << epsilon << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         retval = test<NumericT>(epsilon);
+         if( retval == EXIT_SUCCESS )
+           std::cout << "# Test passed" << std::endl;
+         else
+           return retval;
+       }
+       std::cout << std::endl;
+       std::cout << "----------------------------------------------" << std::endl;
+       std::cout << std::endl;
+    }
+ 
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+ 
+ 
+    return retval;
+ }
+ 
+ 
++>>>>>>> upstream/1.5.1
diff --cc viennacl/ell_matrix.hpp
index 0242c33,6e8af98..7ea705f
--- a/viennacl/ell_matrix.hpp
+++ b/viennacl/ell_matrix.hpp
@@@ -2,24 -2,25 +2,41 @@@
  #define VIENNACL_ELL_MATRIX_HPP_
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
  ============================================================================= */
  
++<<<<<<< HEAD
 +/** @file ell_matrix.hpp
 +    @brief Implementation of the ell_matrix class
 +    
++=======
+ /** @file viennacl/ell_matrix.hpp
+     @brief Implementation of the ell_matrix class
+ 
++>>>>>>> upstream/1.5.1
      Contributed by Volodymyr Kysenko.
  */
  
@@@ -28,65 -29,99 +45,159 @@@
  #include "viennacl/vector.hpp"
  
  #include "viennacl/tools/tools.hpp"
++<<<<<<< HEAD
 +#include "viennacl/ocl/backend.hpp"
 +
 +#include "viennacl/linalg/kernels/ell_matrix_kernels.h"
 +
 +namespace viennacl
 +{
 +    template<typename SCALARTYPE, unsigned int ALIGNMENT /* see forwards.h for default argument */>
 +    class ell_matrix
 +    {
 +
 +      public:
 +        ell_matrix() 
 +        {
 +          viennacl::linalg::kernels::ell_matrix<SCALARTYPE, ALIGNMENT>::init();
 +        }
 +        
 +        ell_matrix(std::size_t row_num, std::size_t col_num) 
 +        {
 +          viennacl::linalg::kernels::ell_matrix<SCALARTYPE, ALIGNMENT>::init();
 +        }
 +    
 +      public:
 +        std::size_t internal_size1() const { return viennacl::tools::roundUpToNextMultiple<std::size_t>(rows_, ALIGNMENT); }
 +        std::size_t internal_size2() const { return viennacl::tools::roundUpToNextMultiple<std::size_t>(cols_, ALIGNMENT); }
 +
 +        std::size_t size1() const { return rows_; }
 +        std::size_t size2() const { return cols_; }
 +        
 +        std::size_t internal_maxnnz() const {return viennacl::tools::roundUpToNextMultiple<std::size_t>(maxnnz_, ALIGNMENT); }
 +        std::size_t maxnnz() const { return maxnnz_; }
 +
 +        std::size_t nnz() const { return rows_ * maxnnz_; }
 +        std::size_t internal_nnz() const { return internal_size1() * internal_maxnnz(); }
 +
 +        const viennacl::ocl::handle<cl_mem>& handle1( ) const { return elements_; } 
 +        const viennacl::ocl::handle<cl_mem>& handle2() const { return coords_; }
 +
 +        template <typename CPU_MATRIX, typename T, unsigned int ALIGN>
 +        friend void copy(const CPU_MATRIX & cpu_matrix, ell_matrix<T, ALIGN> & gpu_matrix );
 +
 +      private:
 +        std::size_t rows_;
 +        std::size_t cols_;
 +        std::size_t maxnnz_;
 +
 +        viennacl::ocl::handle<cl_mem> coords_;
 +        viennacl::ocl::handle<cl_mem> elements_;        
++=======
+ 
+ #include "viennacl/linalg/sparse_matrix_operations.hpp"
+ 
+ namespace viennacl
+ {
+     /** @brief Sparse matrix class using the ELLPACK format for storing the nonzeros.
+       *
+       * This format works best for matrices where the number of nonzeros per row is mostly the same.
+       * Finite element and finite difference methods on nicely shaped domains often result in such a nonzero pattern.
+       * For a matrix
+       *
+       *   (1 2 0 0 0)
+       *   (2 3 4 0 0)
+       *   (0 5 6 0 7)
+       *   (0 0 8 9 0)
+       *
+       * the entries are laid out in chunks of size 3 as
+       *   (1 2 5 8; 2 3 6 9; 0 4 7 0)
+       * Note that this is a 'transposed' representation in order to maximize coalesced memory access.
+       */
+     template<typename SCALARTYPE, unsigned int ALIGNMENT /* see forwards.h for default argument */>
+     class ell_matrix
+     {
+       public:
+         typedef viennacl::backend::mem_handle                                                              handle_type;
+         typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
+         typedef vcl_size_t                                                                                 size_type;
+ 
+         ell_matrix() : rows_(0), cols_(0), maxnnz_(0) {}
+ 
+         ell_matrix(viennacl::context ctx) : rows_(0), cols_(0), maxnnz_(0)
+         {
+             coords_.switch_active_handle_id(ctx.memory_type());
+           elements_.switch_active_handle_id(ctx.memory_type());
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+           if (ctx.memory_type() == OPENCL_MEMORY)
+           {
+               coords_.opencl_handle().context(ctx.opencl_context());
+             elements_.opencl_handle().context(ctx.opencl_context());
+           }
+ #endif
+         }
+ 
+       public:
+         vcl_size_t internal_size1() const { return viennacl::tools::align_to_multiple<vcl_size_t>(rows_, ALIGNMENT); }
+         vcl_size_t internal_size2() const { return viennacl::tools::align_to_multiple<vcl_size_t>(cols_, ALIGNMENT); }
+ 
+         vcl_size_t size1() const { return rows_; }
+         vcl_size_t size2() const { return cols_; }
+ 
+         vcl_size_t internal_maxnnz() const {return viennacl::tools::align_to_multiple<vcl_size_t>(maxnnz_, ALIGNMENT); }
+         vcl_size_t maxnnz() const { return maxnnz_; }
+ 
+         vcl_size_t nnz() const { return rows_ * maxnnz_; }
+         vcl_size_t internal_nnz() const { return internal_size1() * internal_maxnnz(); }
+ 
+               handle_type & handle()       { return elements_; }
+         const handle_type & handle() const { return elements_; }
+ 
+               handle_type & handle2()       { return coords_; }
+         const handle_type & handle2() const { return coords_; }
+ 
+       #if defined(_MSC_VER) && _MSC_VER < 1500          //Visual Studio 2005 needs special treatment
+         template <typename CPU_MATRIX>
+         friend void copy(const CPU_MATRIX & cpu_matrix, ell_matrix & gpu_matrix );
+       #else
+         template <typename CPU_MATRIX, typename T, unsigned int ALIGN>
+         friend void copy(const CPU_MATRIX & cpu_matrix, ell_matrix<T, ALIGN> & gpu_matrix );
+       #endif
+ 
+       private:
+         vcl_size_t rows_;
+         vcl_size_t cols_;
+         vcl_size_t maxnnz_;
+ 
+         handle_type coords_;
+         handle_type elements_;
++>>>>>>> upstream/1.5.1
      };
  
      template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
      void copy(const CPU_MATRIX& cpu_matrix, ell_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix )
      {
++<<<<<<< HEAD
 +      if(cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
 +      {
 +        //determine max capacity for row
 +        std::size_t max_entries_per_row = 0;
 +        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
 +        {
 +          std::size_t num_entries = 0;
++=======
+       assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+       assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+ 
+       if(cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
+       {
+         //determine max capacity for row
+         vcl_size_t max_entries_per_row = 0;
+         for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+         {
+           vcl_size_t num_entries = 0;
++>>>>>>> upstream/1.5.1
            for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
            {
                ++num_entries;
@@@ -100,62 -135,54 +211,106 @@@
          gpu_matrix.rows_ = cpu_matrix.size1();
          gpu_matrix.cols_ = cpu_matrix.size2();
  
++<<<<<<< HEAD
 +        std::size_t nnz = gpu_matrix.internal_nnz();
 +
 +        std::vector<cl_uint> coords(nnz, 0);
 +        std::vector<SCALARTYPE> elements(nnz, 0);
 +
 +        // std::cout << "ELL_MATRIX copy " << gpu_matrix.maxnnz_ << " " << gpu_matrix.rows_ << " " << gpu_matrix.cols_ << " " 
++=======
+         vcl_size_t nnz = gpu_matrix.internal_nnz();
+ 
+         viennacl::backend::typesafe_host_array<unsigned int> coords(gpu_matrix.handle2(), nnz);
+         std::vector<SCALARTYPE> elements(nnz, 0);
+ 
+         // std::cout << "ELL_MATRIX copy " << gpu_matrix.maxnnz_ << " " << gpu_matrix.rows_ << " " << gpu_matrix.cols_ << " "
++>>>>>>> upstream/1.5.1
          //             << gpu_matrix.internal_maxnnz() << "\n";
  
          for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
          {
++<<<<<<< HEAD
 +          std::size_t data_index = 0;
 +          
 +          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
 +          {
 +            coords[gpu_matrix.internal_size1() * data_index + col_it.index1()]   = col_it.index2();
++=======
+           vcl_size_t data_index = 0;
+ 
+           for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+           {
+             coords.set(gpu_matrix.internal_size1() * data_index + col_it.index1(), col_it.index2());
++>>>>>>> upstream/1.5.1
              elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;
              //std::cout << *col_it << "\n";
                data_index++;
            }
          }
  
++<<<<<<< HEAD
 +
 +        gpu_matrix.coords_   = viennacl::ocl::current_context().create_memory(CL_MEM_READ_ONLY, coords);
 +        gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_ONLY, elements);
++=======
+         viennacl::backend::memory_create(gpu_matrix.handle2(), coords.raw_size(),                   traits::context(gpu_matrix.handle2()), coords.get());
+         viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(SCALARTYPE) * elements.size(), traits::context(gpu_matrix.handle()), &(elements[0]));
++>>>>>>> upstream/1.5.1
        }
      }
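  
      // Worked illustration (hedged, host-side only) of the layout produced above
      // for the 4x5 example from the class documentation, with maxnnz = 3 and
      // ALIGNMENT = 1. Storage is slot-major, the row index varying fastest:
      //
      //   double dense[4][5] = { {1,2,0,0,0}, {2,3,4,0,0}, {0,5,6,0,7}, {0,0,8,9,0} };
      //   std::vector<double>       elements(4 * 3, 0);
      //   std::vector<unsigned int> coords  (4 * 3, 0);
      //   for (std::size_t i = 0; i < 4; ++i)
      //   {
      //     std::size_t slot = 0;
      //     for (std::size_t j = 0; j < 5; ++j)
      //       if (dense[i][j] != 0)
      //       {
      //         elements[slot * 4 + i] = dense[i][j];                  // -> 1 2 5 8  2 3 6 9  0 4 7 0
      //         coords  [slot * 4 + i] = static_cast<unsigned int>(j); // padded slots keep 0
      //         ++slot;
      //       }
      //   }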
  
      template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
      void copy(const ell_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix, CPU_MATRIX& cpu_matrix)
      {
++<<<<<<< HEAD
 +      if(gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
 +      {
 +        cpu_matrix.resize(gpu_matrix.size1(), gpu_matrix.size2());
 +
 +        std::vector<SCALARTYPE> elements(gpu_matrix.internal_nnz());
 +        std::vector<cl_uint> coords(gpu_matrix.internal_nnz());
 +
 +        cl_int err;
 +
 +        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle1(), CL_TRUE, 0, sizeof(SCALARTYPE) * elements.size(), &(elements[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle2(), CL_TRUE, 0, sizeof(cl_uint) * coords.size(), &(coords[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +
 +        viennacl::ocl::get_queue().finish();
 +
 +        for(std::size_t row = 0; row < gpu_matrix.size1(); row++)
 +        {
 +          for(std::size_t ind = 0; ind < gpu_matrix.internal_maxnnz(); ind++)
 +          {
 +            std::size_t offset = gpu_matrix.internal_size1() * ind + row;
 +            
 +            if(elements[offset] == static_cast<SCALARTYPE>(0.0))
 +            {
 +                continue;
 +            }
++=======
+       assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+       assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+ 
+       if(gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
+       {
+         std::vector<SCALARTYPE> elements(gpu_matrix.internal_nnz());
+         viennacl::backend::typesafe_host_array<unsigned int> coords(gpu_matrix.handle2(), gpu_matrix.internal_nnz());
+ 
+         viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE) * elements.size(), &(elements[0]));
+         viennacl::backend::memory_read(gpu_matrix.handle2(), 0, coords.raw_size(), coords.get());
+ 
+         for(vcl_size_t row = 0; row < gpu_matrix.size1(); row++)
+         {
+           for(vcl_size_t ind = 0; ind < gpu_matrix.internal_maxnnz(); ind++)
+           {
+             vcl_size_t offset = gpu_matrix.internal_size1() * ind + row;
+ 
+             if(elements[offset] == static_cast<SCALARTYPE>(0.0))
+                 continue;
++>>>>>>> upstream/1.5.1
  
              if(coords[offset] >= gpu_matrix.size2())
              {
@@@ -169,108 -196,101 +324,207 @@@
        }
      }
  
++<<<<<<< HEAD
 +    namespace linalg
 +    {
 +      /** @brief Returns a proxy class that represents matrix-vector multiplication with a hyb_matrix
 +      *
 +      * This is used for the convenience expression result = prod(mat, vec);
 +      *
 +      * @param mat    The matrix
 +      * @param vec    The vector
 +      */
 +      template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
 +      vector_expression<const ell_matrix<SCALARTYPE, ALIGNMENT>,
 +                        const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
 +                        op_prod > prod_impl(const ell_matrix<SCALARTYPE, ALIGNMENT> & mat, 
 +                                            const vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
 +      {
 +        return vector_expression<const ell_matrix<SCALARTYPE, ALIGNMENT>,
 +                                 const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
 +                                 op_prod >(mat, vec);
 +      }
 +      
 +      template<class TYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
 +      void prod_impl(
 +                      const viennacl::ell_matrix<TYPE, ALIGNMENT>& mat, 
 +                      const viennacl::vector<TYPE, VECTOR_ALIGNMENT>& vec,
 +                      viennacl::vector<TYPE, VECTOR_ALIGNMENT>& result)
 +      {
 +        assert(mat.size1() == result.size());
 +        assert(mat.size2() == vec.size());
 +
 +        result.clear();
 +
 +        std::stringstream ss;
 +        ss << "vec_mul_" << 1;//(ALIGNMENT != 1?4:1);
 +        viennacl::ocl::kernel& k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::ell_matrix<TYPE, ALIGNMENT>::program_name(), "vec_mul");
 +
 +        unsigned int thread_num = 128;
 +        unsigned int group_num = 256;
 +
 +        k.local_work_size(0, thread_num);
 +        k.global_work_size(0, thread_num * group_num);
 +
 +        viennacl::ocl::enqueue(k(mat.handle2(), 
 +                                 mat.handle1(),
 +                                 vec,
 +                                 result,
 +                                 cl_uint(mat.size1()),
 +                                 cl_uint(mat.size2()),
 +                                 cl_uint(mat.internal_size1()),
 +                                 cl_uint(mat.maxnnz()),
 +                                 cl_uint(mat.internal_maxnnz())
 +                                ) 
 +        );
 +
 +
 +      }
 +    }
 +    
 +    
 +    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
 +    *
 +    * @param proxy  An expression template proxy class.
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT>
 +    template <unsigned int MAT_ALIGNMENT>
 +    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
 +    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const ell_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
 +                                                                                          viennacl::op_prod> & proxy) 
 +    {
 +      // check for the special case x = A * x
 +      if (proxy.rhs().handle().get() == this->handle().get())
 +      {
 +        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
 +        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
 +        *this = result;
 +        return *this;
 +      }
 +      else
 +      {
 +        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +      return *this;
 +    }
 +    
 +}
 +
 +#endif
 +
 +
 +
 +
 +
 +
 +
 +
 +
 +
 +
 +
 +
 +
 +
++=======
+ 
+     //
+     // Specify available operations:
+     //
+ 
+     /** \cond */
+ 
+     namespace linalg
+     {
+       namespace detail
+       {
+         // x = A * y
+         template <typename T, unsigned int A>
+         struct op_executor<vector_base<T>, op_assign, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+             {
+               // check for the special case x = A * x
+               if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+               {
+                 viennacl::vector<T> temp(lhs);
+                 viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+                 lhs = temp;
+               }
+               else
+                 viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+             }
+         };
+ 
+         template <typename T, unsigned int A>
+         struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+             {
+               viennacl::vector<T> temp(lhs);
+               viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+               lhs += temp;
+             }
+         };
+ 
+         template <typename T, unsigned int A>
+         struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+             {
+               viennacl::vector<T> temp(lhs);
+               viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+               lhs -= temp;
+             }
+         };
+ 
+ 
+         // x = A * vec_op
+         template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+         struct op_executor<vector_base<T>, op_assign, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+             {
+               viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+               viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+             }
+         };
+ 
+         // x = A * vec_op
+         template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+         struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+             {
+               viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+               viennacl::vector<T> temp_result(lhs);
+               viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+               lhs += temp_result;
+             }
+         };
+ 
+         // x = A * vec_op
+         template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+         struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+             {
+               viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+               viennacl::vector<T> temp_result(lhs);
+               viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+               lhs -= temp_result;
+             }
+         };
+ 
+      } // namespace detail
+    } // namespace linalg
+ 
+     /** \endcond */
+ }
+ 
+ #endif
++>>>>>>> upstream/1.5.1
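+ 
+ // Hedged usage sketch of the operations enabled by the op_executor
+ // specializations above (types, sizes, and setup illustrative):
+ //
+ //   viennacl::ell_matrix<float> A;        // filled via viennacl::copy(...)
+ //   viennacl::vector<float>     x(n), y(n);
+ //   x  = viennacl::linalg::prod(A, y);    // op_assign, writes x directly
+ //   x += viennacl::linalg::prod(A, y);    // op_inplace_add, adds via a temporary
+ //   x  = viennacl::linalg::prod(A, x);    // aliasing detected via traits::handle,
+ //                                         // a temporary keeps the result correct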
  
  
diff --cc viennacl/forwards.h
index 8695f96,d2ba91f..8d995c9
--- a/viennacl/forwards.h
+++ b/viennacl/forwards.h
@@@ -23,10 -24,10 +24,14 @@@
  */
  
  /**
++<<<<<<< HEAD
 + @mainpage Source Code Documentation for ViennaCL 1.3.0
++=======
+  @mainpage Source Code Documentation for ViennaCL 1.5.1
++>>>>>>> upstream/1.5.1
  
   This is the source code documentation of ViennaCL. Detailed information about the functions in ViennaCL can be found here.
-  
+ 
   For a general overview over the types and functionality provided by ViennaCL, please refer to the file doc/viennacl.pdf
  
  */
@@@ -117,73 -284,237 +288,261 @@@ namespace viennac
  
    //
    // Matrix types:
-   //  
+   //
+ 
+   template<class SCALARTYPE, typename F = row_major, typename SizeType = vcl_size_t, typename DistanceType = vcl_ptrdiff_t>
+   class matrix_base;
+ 
    template <class SCALARTYPE, typename F = row_major, unsigned int ALIGNMENT = 1>
    class matrix;
-   
+ 
+   template<typename SCALARTYPE>
+   class implicit_matrix_base;
+ 
+   template <class SCALARTYPE>
+   class identity_matrix;
+ 
+   template <class SCALARTYPE>
+   class zero_matrix;
+ 
+   template <class SCALARTYPE>
+   class scalar_matrix;
+ 
    template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
    class compressed_matrix;
-   
+ 
+   template<class SCALARTYPE>
+   class compressed_compressed_matrix;
+ 
+ 
    template<class SCALARTYPE, unsigned int ALIGNMENT = 128>
-   class coordinate_matrix;    
+   class coordinate_matrix;
+ 
+   template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+   class ell_matrix;
+ 
+   template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+   class hyb_matrix;
  
    template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
 +  class ell_matrix;
 +
 +  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
 +  class hyb_matrix;
 +  
 +  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
    class circulant_matrix;
-     
+ 
    template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
    class hankel_matrix;
-   
+ 
    template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
    class toeplitz_matrix;
-   
+ 
    template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
    class vandermonde_matrix;
-   
+ 
    //
    // Proxies:
    //
-   template <typename SizeType = std::size_t, typename DistanceType = std::ptrdiff_t>
+   template <typename SizeType = vcl_size_t, typename DistanceType = std::ptrdiff_t>
    class basic_range;
-   
+ 
    typedef basic_range<>  range;
  
++<<<<<<< HEAD
 +  template <typename SizeType = std::size_t, typename DistanceType = std::ptrdiff_t>
 +  class basic_slice;
 +  
++=======
+   template <typename SizeType = vcl_size_t, typename DistanceType = std::ptrdiff_t>
+   class basic_slice;
+ 
++>>>>>>> upstream/1.5.1
    typedef basic_slice<>  slice;
  
    template <typename VectorType>
    class vector_range;
++<<<<<<< HEAD
 +  
 +  template <typename VectorType>
 +  class vector_slice;
 +  
++=======
+ 
+   template <typename VectorType>
+   class vector_slice;
+ 
++>>>>>>> upstream/1.5.1
    template <typename MatrixType>
    class matrix_range;
  
    template <typename MatrixType>
    class matrix_slice;
++<<<<<<< HEAD
 +  
 +  
++=======
+ 
+ 
+   /** @brief Helper struct for checking whether a type is a host scalar type (e.g. float, double) */
++>>>>>>> upstream/1.5.1
    template <typename T>
-   struct is_scalar;
+   struct is_cpu_scalar
+   {
+     enum { value = false };
+   };
  
+   /** @brief Helper struct for checking whether a type is a viennacl::scalar<> */
    template <typename T>
-   struct is_vector;
+   struct is_scalar
+   {
+     enum { value = false };
+   };
  
+   /** @brief Helper struct for checking whether a type represents a sign flip on a viennacl::scalar<> */
    template <typename T>
-   struct is_matrix;
-   
+   struct is_flip_sign_scalar
+   {
+     enum { value = false };
+   };
+ 
+   /** @brief Helper struct for checking whether the provided type represents a scalar (either host, from ViennaCL, or a flip-sign proxy) */
+   template <typename T>
+   struct is_any_scalar
+   {
+     enum { value = (is_scalar<T>::value || is_cpu_scalar<T>::value || is_flip_sign_scalar<T>::value )};
+   };
+ 
+   /** @brief Checks for a type being either vector_base or implicit_vector_base */
+   template<typename T>
+   struct is_any_vector { enum { value = 0 }; };
+ 
+   /** @brief Checks for either matrix_base or implicit_matrix_base */
+   template<typename T>
+   struct is_any_dense_matrix { enum { value = 0 }; };
+ 
+   /** @brief Helper class for checking whether a matrix has a row-major layout. */
+   template <typename T>
+   struct is_row_major
+   {
+     enum { value = false };
+   };
+ 
+   /** @brief Helper class for checking whether a matrix is a compressed_matrix (CSR format) */
+   template <typename T>
+   struct is_compressed_matrix
+   {
+     enum { value = false };
+   };
+ 
+   /** @brief Helper class for checking whether a matrix is a coordinate_matrix (COO format) */
+   template <typename T>
+   struct is_coordinate_matrix
+   {
+     enum { value = false };
+   };
+ 
+   /** @brief Helper class for checking whether a matrix is an ell_matrix (ELL format) */
+   template <typename T>
+   struct is_ell_matrix
+   {
+     enum { value = false };
+   };
+ 
+   /** @brief Helper class for checking whether a matrix is a hyb_matrix (hybrid format: ELL plus CSR) */
+   template <typename T>
+   struct is_hyb_matrix
+   {
+     enum { value = false };
+   };
+ 
+   /** @brief Helper class for checking whether the provided type is one of the sparse matrix types (compressed_matrix, coordinate_matrix, etc.) */
+   template <typename T>
+   struct is_any_sparse_matrix
+   {
+     enum { value = false };
+   };
+ 
+ 
+   /** @brief Helper class for checking whether a matrix is a circulant matrix */
+   template <typename T>
+   struct is_circulant_matrix
+   {
+     enum { value = false };
+   };
+ 
+   /** @brief Helper class for checking whether a matrix is a Hankel matrix */
+   template <typename T>
+   struct is_hankel_matrix
+   {
+     enum { value = false };
+   };
+ 
+   /** @brief Helper class for checking whether a matrix is a Toeplitz matrix */
+   template <typename T>
+   struct is_toeplitz_matrix
+   {
+     enum { value = false };
+   };
+ 
+   /** @brief Helper class for checking whether a matrix is a Vandermonde matrix */
+   template <typename T>
+   struct is_vandermonde_matrix
+   {
+     enum { value = false };
+   };
+ 
+   /** @brief Helper class for checking whether the provided type is any of the dense structured matrix types (circulant, Hankel, etc.) */
+   template <typename T>
+   struct is_any_dense_structured_matrix
+   {
+     enum { value = viennacl::is_circulant_matrix<T>::value || viennacl::is_hankel_matrix<T>::value || viennacl::is_toeplitz_matrix<T>::value || viennacl::is_vandermonde_matrix<T>::value };
+   };
+ 
+ 
+   enum memory_types
+   {
+     MEMORY_NOT_INITIALIZED
+     , MAIN_MEMORY
+     , OPENCL_MEMORY
+     , CUDA_MEMORY
+   };
+ 
+   /** @brief Exception class in case of memory errors */
+   class memory_exception : public std::exception
+   {
+   public:
+     memory_exception() : message_() {}
+     memory_exception(std::string message) : message_("ViennaCL: Internal memory error: " + message) {}
+ 
+     virtual const char* what() const throw() { return message_.c_str(); }
+ 
+     virtual ~memory_exception() throw() {}
+   private:
+     std::string message_;
+   };
+ 
+   class cuda_not_available_exception : public std::exception
+   {
+   public:
+     cuda_not_available_exception() : message_("ViennaCL was compiled without CUDA support, but CUDA functionality is required for this operation.") {}
+ 
+     virtual const char* what() const throw() { return message_.c_str(); }
+ 
+     virtual ~cuda_not_available_exception() throw() {}
+   private:
+     std::string message_;
+   };
+ 
+ 
+   class context;
+ 
    namespace tools
    {
-     //helper for matrix row/col iterators 
+     //helper for matrix row/col iterators
      //must be specialized for every viennacl matrix type
+     /** @brief Helper class for incrementing an iterator in a dense matrix. */
      template <typename ROWCOL, typename MATRIXTYPE>
      struct MATRIX_ITERATOR_INCREMENTER
      {
@@@ -200,96 -532,182 +560,238 @@@
      void convolve_i(viennacl::vector<SCALARTYPE, ALIGNMENT>& input1,
                      viennacl::vector<SCALARTYPE, ALIGNMENT>& input2,
                      viennacl::vector<SCALARTYPE, ALIGNMENT>& output);
-     
- #ifndef _MSC_VER
+ 
+     template <typename T>
+     viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_prod> >
+     element_prod(vector_base<T> const & v1, vector_base<T> const & v2);
+ 
+     template <typename T>
+     viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_div> >
+     element_div(vector_base<T> const & v1, vector_base<T> const & v2);
+ 
+ 
+ 
+     template <typename T>
+     void inner_prod_impl(vector_base<T> const & vec1,
+                          vector_base<T> const & vec2,
+                          scalar<T> & result);
+ 
+     template <typename LHS, typename RHS, typename OP, typename T>
+     void inner_prod_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+                          vector_base<T> const & vec2,
+                          scalar<T> & result);
+ 
+     template <typename T, typename LHS, typename RHS, typename OP>
+     void inner_prod_impl(vector_base<T> const & vec1,
+                          viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+                          scalar<T> & result);
+ 
+     template <typename LHS1, typename RHS1, typename OP1,
+               typename LHS2, typename RHS2, typename OP2, typename T>
+     void inner_prod_impl(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+                          viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+                          scalar<T> & result);
+ 
+     ///////////////////////////
+ 
+     template <typename T>
+     void inner_prod_cpu(vector_base<T> const & vec1,
+                         vector_base<T> const & vec2,
+                         T & result);
+ 
+     template <typename LHS, typename RHS, typename OP, typename T>
+     void inner_prod_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+                         vector_base<T> const & vec2,
+                         T & result);
+ 
+     template <typename T, typename LHS, typename RHS, typename OP>
+     void inner_prod_cpu(vector_base<T> const & vec1,
+                         viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+                         T & result);
+ 
+     template <typename LHS1, typename RHS1, typename OP1,
+               typename LHS2, typename RHS2, typename OP2, typename S3>
+     void inner_prod_cpu(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+                         viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+                         S3 & result);
+ 
+ 
+ 
      //forward definition of norm_1_impl function
-     template <typename V1, typename S2>
-     void norm_1_impl(V1 const & vec,
-                      S2 & result,
-                       typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                     && viennacl::is_scalar<S2>::value
-                                                   >::type * dummy = 0);
+     template <typename T>
+     void norm_1_impl(vector_base<T> const & vec, scalar<T> & result);
+ 
+     //template <typename T, typename F>
+     //void norm_1_impl(matrix_base<T, F> const & A, scalar<T> & result);
+ 
+     template <typename LHS, typename RHS, typename OP, typename T>
+     void norm_1_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                      scalar<T> & result);
+ 
+ 
+     template <typename T>
+     void norm_1_cpu(vector_base<T> const & vec,
+                     T & result);
+ 
+     //template <typename T, typename F>
+     //void norm_1_cpu(matrix_base<T, F> const & vec,
+     //                T & result);
+ 
+     template <typename LHS, typename RHS, typename OP, typename S2>
+     void norm_1_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                     S2 & result);
  
      //forward definition of norm_2_impl function
-     template <typename V1, typename S2>
-     void norm_2_impl(V1 const & vec,
-                      S2 & result,
-                      typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                    && viennacl::is_scalar<S2>::value
-                                                  >::type * dummy = 0);
+     template <typename T>
+     void norm_2_impl(vector_base<T> const & vec, scalar<T> & result);
+ 
+     template <typename LHS, typename RHS, typename OP, typename T>
+     void norm_2_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                      scalar<T> & result);
+ 
+     template <typename T>
+     void norm_2_cpu(vector_base<T> const & vec, T & result);
+ 
+     template <typename LHS, typename RHS, typename OP, typename S2>
+     void norm_2_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                     S2 & result);
+ 
  
      //forward definition of norm_inf_impl function
-     template <typename V1, typename S2>
-     void norm_inf_impl(V1 const & vec,
-                        S2 & result,
-                        typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                      && viennacl::is_scalar<S2>::value
-                                                    >::type * dummy = 0);
- #endif    
-     
+     template <typename T>
+     void norm_inf_impl(vector_base<T> const & vec, scalar<T> & result);
+ 
+     //template <typename T, typename F>
+     //void norm_inf_impl(matrix_base<T, F> const & vec, scalar<T> & result);
+ 
+     template <typename LHS, typename RHS, typename OP, typename T>
+     void norm_inf_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                       scalar<T> & result);
+ 
+ 
+     template <typename T>
+     void norm_inf_cpu(vector_base<T> const & vec, T & result);
+ 
+     //template <typename T, typename F>
+     //void norm_inf_cpu(matrix_base<T, F> const & vec, T & result);
+ 
+     template <typename LHS, typename RHS, typename OP, typename S2>
+     void norm_inf_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                       S2 & result);
+ 
+     template <typename T, typename F>
+     void norm_frobenius_impl(matrix_base<T, F> const & vec, scalar<T> & result);
+ 
+     template <typename T, typename F>
+     void norm_frobenius_cpu(matrix_base<T, F> const & vec, T & result);
+ 
+ 
+     template <typename T>
+     vcl_size_t index_norm_inf(vector_base<T> const & vec);
+ 
+     template <typename LHS, typename RHS, typename OP>
+     vcl_size_t index_norm_inf(viennacl::vector_expression<LHS, RHS, OP> const & vec);
+ 
      //forward definition of prod_impl functions
++<<<<<<< HEAD
 +    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
 +    viennacl::vector_expression<const viennacl::matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
 +                                op_prod > prod_impl(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> &, 
 +                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
 +
 +    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
 +    viennacl::vector_expression<const viennacl::compressed_matrix<SCALARTYPE, ALIGNMENT>,
 +                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
 +                                op_prod > prod_impl(const viennacl::compressed_matrix<SCALARTYPE, ALIGNMENT> & , 
 +                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
 +
 +    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
 +    viennacl::vector_expression<const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT>,
 +                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
 +                                op_prod > prod_impl(const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT> & , 
 +                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
 +
 +    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
 +    viennacl::vector_expression<const viennacl::ell_matrix<SCALARTYPE, ALIGNMENT>,
 +                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
 +                                op_prod > prod_impl(const viennacl::ell_matrix<SCALARTYPE, ALIGNMENT> & , 
 +                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
 +
 +    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
 +    viennacl::vector_expression<const viennacl::hyb_matrix<SCALARTYPE, ALIGNMENT>,
 +                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
 +                                op_prod > prod_impl(const viennacl::hyb_matrix<SCALARTYPE, ALIGNMENT> & , 
 +                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
 +                                
 +    //forward definition of inner_prod_impl function
 +    /*template <typename V1, typename V2>
 +    typename viennacl::enable_if< viennacl::is_vector<V1>::value
 +                                  && viennacl::is_vector<V2>::value,
 +                                  viennacl::scalar_expression< const V1, 
 +                                                               const V2,
 +                                                               viennacl::op_inner_prod >
 +                                >::type
 +    inner_prod_impl(V1 const & vec1,
 +                    V2 const & vec2);*/
 +    
 +#ifndef _MSC_VER
 +    template <typename V1, typename V2, typename S3>
 +    void inner_prod_impl(V1 const & vec1,
 +                         V2 const & vec2,
 +                         S3 & result,
 +                         typename viennacl::enable_if< viennacl::is_vector<V1>::value
 +                                                       && viennacl::is_vector<V2>::value
 +                                                       && viennacl::is_scalar<S3>::value
 +                                                     >::type * dummy = 0);
 +#endif                                                   
 +                    
 +      
++=======
+ 
+     template <typename NumericT, typename F>
+     void prod_impl(const matrix_base<NumericT, F> & mat,
+                    const vector_base<NumericT> & vec,
+                          vector_base<NumericT> & result);
+ 
+     template <typename NumericT, typename F>
+     void prod_impl(const matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & mat_trans,
+                    const vector_base<NumericT> & vec,
+                          vector_base<NumericT> & result);
+ 
+     template<typename SparseMatrixType, class SCALARTYPE, unsigned int ALIGNMENT>
+     typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                   vector_expression<const SparseMatrixType,
+                                                     const vector<SCALARTYPE, ALIGNMENT>,
+                                                     op_prod >
+                                  >::type
+     prod_impl(const SparseMatrixType & mat,
+               const vector<SCALARTYPE, ALIGNMENT> & vec);
+ #endif
+ 
+     namespace detail
+     {
+       enum row_info_types
+       {
+         SPARSE_ROW_NORM_INF = 0,
+         SPARSE_ROW_NORM_1,
+         SPARSE_ROW_NORM_2,
+         SPARSE_ROW_DIAGONAL
+       };
+ 
+     }
+ 
+ 
++>>>>>>> upstream/1.5.1
      /** @brief A tag class representing a lower triangular matrix */
-     struct lower_tag 
+     struct lower_tag
      {
-       static const char * const name() { return "lower"; }
+       static const char * name() { return "lower"; }
      };      //lower triangular matrix
      /** @brief A tag class representing an upper triangular matrix */
-     struct upper_tag 
+     struct upper_tag
      {
-       static const char * const name() { return "upper"; }
+       static const char * name() { return "upper"; }
      };      //upper triangular matrix
      /** @brief A tag class representing a lower triangular matrix with unit diagonal*/
      struct unit_lower_tag
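
All the is_* helper structs declared in the forwards.h hunk above follow one compile-time pattern: the primary template answers false, per-type specializations elsewhere in the library opt individual types in, and enable_if-style dispatch keys off ::value. A self-contained restatement of the idiom, with generic names rather than ViennaCL's:

    // Primary template: by default, nothing counts as a scalar.
    template <typename T>
    struct is_my_scalar { enum { value = false }; };

    // Opt-in specialization for one concrete type.
    template <>
    struct is_my_scalar<double> { enum { value = true }; };

    // Minimal enable_if, as in pre-C++11 code bases.
    template <bool B, typename T = void> struct my_enable_if {};
    template <typename T> struct my_enable_if<true, T> { typedef T type; };

    // Participates in overload resolution only for opted-in types.
    template <typename T>
    typename my_enable_if<is_my_scalar<T>::value, T>::type
    twice(T x) { return x + x; }

    int main()
    {
      return static_cast<int>(twice(21.0)) - 42;  // 0; twice(1) would not compile
    }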
diff --cc viennacl/generator/forwards.h
index 123483e,fcf5edb..c11ea53
--- a/viennacl/generator/forwards.h
+++ b/viennacl/generator/forwards.h
@@@ -2,63 -2,141 +2,197 @@@
  #define VIENNACL_GENERATOR_FORWARDS_H
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
  ============================================================================= */
  
++<<<<<<< HEAD
 +/** @file viennacl/generator/forwards.h
 + *  @brief Forward declarations of the important structures for the kernel generator
 + * 
 + *  Generator code contributed by Philippe Tillet
 + */
 +
 +#include <string>
 +#include "viennacl/forwards.h"
 +
 +namespace viennacl 
 +{
 +  namespace generator
 +  {
 +
 +    template<class T>
 +    struct is_temporary;
 +
 +    template<class LHS, class OP_TYPE, class RHS, bool is_temporary = false>
 +    class compound_node;
 +
 +    template<class T>
 +    struct inner_prod_impl_t;
 +
 +    template< unsigned int ID, typename SCALARTYPE, unsigned int ALIGNMENT = 1>
 +    class symbolic_vector;
 +
 +    template<class REF>
 +    class tmp_symbolic_vector;
 +
 +    template<unsigned int ID,
 +             typename SCALARTYPE, class F = viennacl::row_major, unsigned int ALIGNMENT = 1>
 +    class symbolic_matrix;
 +
 +    template<class REF>
 +    class tmp_symbolic_matrix;
 +
 +    template<unsigned int ID,typename SCALARTYPE>
 +    class cpu_symbolic_scalar;
 +
 +    template<unsigned int ID,typename SCALARTYPE>
 +    class gpu_symbolic_scalar;
 +
 +  }
++=======
+ 
+ /** @file viennacl/generator/forwards.h
+     @brief Forward declarations
+ */
+ 
+ #include <map>
+ #include <set>
+ #include <list>
+ #include <stdexcept>
+ 
+ #include "viennacl/tools/shared_ptr.hpp"
+ #include "viennacl/scheduler/forwards.h"
+ 
+ namespace viennacl{
+ 
+   namespace generator{
+ 
+     inline void generate_enqueue_statement(viennacl::scheduler::statement const & s, scheduler::statement_node const & root_node);
+     inline void generate_enqueue_statement(viennacl::scheduler::statement const & s);
+ 
+     enum expression_type_family{
+       SCALAR_SAXPY_FAMILY,
+       VECTOR_SAXPY_FAMILY,
+       MATRIX_SAXPY_FAMILY,
+       SCALAR_REDUCE_FAMILY,
+       VECTOR_REDUCE_FAMILY,
+       MATRIX_PRODUCT_FAMILY,
+       INVALID_EXPRESSION_FAMILY
+     };
+ 
+     enum expression_type{
+       SCALAR_SAXPY_TYPE,
+       VECTOR_SAXPY_TYPE,
+       MATRIX_SAXPY_TYPE,
+       SCALAR_REDUCE_TYPE,
+       VECTOR_REDUCE_Nx_TYPE,
+       VECTOR_REDUCE_Tx_TYPE,
+       MATRIX_PRODUCT_NN_TYPE,
+       MATRIX_PRODUCT_TN_TYPE,
+       MATRIX_PRODUCT_NT_TYPE,
+       MATRIX_PRODUCT_TT_TYPE,
+       INVALID_EXPRESSION_TYPE
+     };
+ 
+     inline const char * expression_type_to_string(expression_type type){
+       switch(type){
+         case SCALAR_SAXPY_TYPE : return "Scalar SAXPY";
+         case VECTOR_SAXPY_TYPE : return "Vector SAXPY";
+         case MATRIX_SAXPY_TYPE : return "Matrix SAXPY";
+         case SCALAR_REDUCE_TYPE : return "Inner Product";
+         case VECTOR_REDUCE_Nx_TYPE : return "Matrix-Vector Product : Ax";
+         case VECTOR_REDUCE_Tx_TYPE : return "Matrix-Vector Product : Tx";
+         case MATRIX_PRODUCT_NN_TYPE : return "Matrix-Matrix Product : AA";
+         case MATRIX_PRODUCT_TN_TYPE : return "Matrix-Matrix Product : TA";
+         case MATRIX_PRODUCT_NT_TYPE : return "Matrix-Matrix Product : AT";
+         case MATRIX_PRODUCT_TT_TYPE : return "Matrix-Matrix Product : TT";
+         default : return "INVALID EXPRESSION";
+       }
+     }
+ 
+     typedef std::pair<expression_type, vcl_size_t> expression_key_type;
+ 
+     /** @brief A class for holding meta information such as the type or the underlying scalar type of an expression (such as x = inner_prod(y, z)). */
+     struct expression_descriptor{
+         expression_key_type make_key() const { return expression_key_type(type,scalartype_size); }
+         bool operator==(expression_descriptor const & other) const
+         {
+           return type_family == other.type_family && type == other.type && scalartype_size==other.scalartype_size;
+         }
+         expression_type_family type_family;
+         expression_type type;
+         vcl_size_t scalartype_size;
+     };
+ 
+     /** @brief Emulation of C++11's .at() member for std::map<> */
+     template <typename KeyT, typename ValueT>
+     ValueT const & at(std::map<KeyT, ValueT> const & map, KeyT const & key)
+     {
+       typename std::map<KeyT, ValueT>::const_iterator it = map.find(key);
+       if (it != map.end())
+         return it->second;
+ 
+       throw std::out_of_range("Generator: Key not found in map");
+     }
+ 
+     namespace utils{
+       class kernel_generation_stream;
+     }
+ 
+     namespace detail{
+ 
+       enum node_type{
+         LHS_NODE_TYPE,
+         PARENT_NODE_TYPE,
+         RHS_NODE_TYPE
+       };
+ 
+       class mapped_object;
+ 
+       typedef std::pair<viennacl::scheduler::statement_node const *, node_type> key_type;
+       typedef tools::shared_ptr<detail::mapped_object> container_ptr_type;
+       typedef std::map<key_type, container_ptr_type> mapping_type;
+ 
+       template<class Fun>
+       static void traverse(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, Fun const & fun, bool recurse_binary_leaf = true);
+       inline std::string generate(std::pair<std::string, std::string> const & index, int vector_element, mapped_object const & s);
+       static std::string & append_kernel_arguments(std::set<std::string> & already_generated, std::string & str, unsigned int vector_size, mapped_object const & s);
+       static void fetch(std::pair<std::string, std::string> const & index, unsigned int vectorization, std::set<std::string> & fetched, utils::kernel_generation_stream & stream, mapped_object & s);
+       inline const char * generate(viennacl::scheduler::operation_node_type type);
+       static void generate_all_rhs(viennacl::scheduler::statement const & statement
+                                 , viennacl::scheduler::statement_node const & root_node
+                                 , std::pair<std::string, std::string> const & index
+                                 , int vector_element
+                                 , std::string & str
+                                 , detail::mapping_type const & mapping);
+ 
+     }
+ 
+   }
+ 
++>>>>>>> upstream/1.5.1
  }
  #endif
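
The at() helper in the hunk above backports the bounds-checked lookup of C++11's std::map::at() to C++03. A usage sketch (the map contents are invented for illustration; only the header above is assumed):

    #include <map>
    #include <string>
    #include "viennacl/generator/forwards.h"

    int main()
    {
      std::map<std::string, int> limits;
      limits["work_group_size"] = 256;

      // Checked lookup: returns a const reference or throws std::out_of_range.
      int wg = viennacl::generator::at(limits, std::string("work_group_size"));
      return wg - 256;  // 0
    }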
diff --cc viennacl/hyb_matrix.hpp
index deaf71c,d04de34..2adf074
--- a/viennacl/hyb_matrix.hpp
+++ b/viennacl/hyb_matrix.hpp
@@@ -2,24 -2,25 +2,41 @@@
  #define VIENNACL_HYB_MATRIX_HPP_
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
  ============================================================================= */
  
++<<<<<<< HEAD
 +/** @file hyb_matrix.hpp
 +    @brief Implementation of the hyb_matrix class
 +    
++=======
+ /** @file viennacl/hyb_matrix.hpp
+     @brief Implementation of the hyb_matrix class
+ 
++>>>>>>> upstream/1.5.1
      Contributed by Volodymyr Kysenko.
  */
  
@@@ -27,77 -28,101 +44,171 @@@
  #include "viennacl/vector.hpp"
  
  #include "viennacl/tools/tools.hpp"
++<<<<<<< HEAD
 +#include "viennacl/ocl/backend.hpp"
 +
 +#include "viennacl/linalg/kernels/hyb_matrix_kernels.h"
 +
 +namespace viennacl
 +{
 +    template<typename SCALARTYPE, unsigned int ALIGNMENT  /* see forwards.h for default argument */>
 +    class hyb_matrix
 +    {
 +
 +      public:
 +        hyb_matrix() : csr_threshold_(0.8), rows_(0), cols_(0) 
 +        {
 +          viennacl::linalg::kernels::hyb_matrix<SCALARTYPE, ALIGNMENT>::init();
 +        }
 +        
 +        hyb_matrix(std::size_t row_num, std::size_t col_num) : csr_threshold_(0.8), rows_(row_num), cols_(col_num)
 +        {
 +          viennacl::linalg::kernels::hyb_matrix<SCALARTYPE, ALIGNMENT>::init();
++=======
+ 
+ #include "viennacl/linalg/sparse_matrix_operations.hpp"
+ 
+ namespace viennacl
+ {
+     /** @brief Sparse matrix class using a hybrid format composed of the ELL and CSR formats for storing the nonzeros. */
+     template<typename SCALARTYPE, unsigned int ALIGNMENT  /* see forwards.h for default argument */>
+     class hyb_matrix
+     {
+       public:
+         typedef viennacl::backend::mem_handle                                                              handle_type;
+         typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
+ 
+         hyb_matrix() : csr_threshold_(SCALARTYPE(0.8)), rows_(0), cols_(0) {}
+ 
+         hyb_matrix(viennacl::context ctx) : csr_threshold_(SCALARTYPE(0.8)), rows_(0), cols_(0)
+         {
+             ell_coords_.switch_active_handle_id(ctx.memory_type());
+           ell_elements_.switch_active_handle_id(ctx.memory_type());
+ 
+               csr_rows_.switch_active_handle_id(ctx.memory_type());
+               csr_cols_.switch_active_handle_id(ctx.memory_type());
+           csr_elements_.switch_active_handle_id(ctx.memory_type());
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+           if (ctx.memory_type() == OPENCL_MEMORY)
+           {
+               ell_coords_.opencl_handle().context(ctx.opencl_context());
+             ell_elements_.opencl_handle().context(ctx.opencl_context());
+ 
+                 csr_rows_.opencl_handle().context(ctx.opencl_context());
+                 csr_cols_.opencl_handle().context(ctx.opencl_context());
+             csr_elements_.opencl_handle().context(ctx.opencl_context());
+           }
+ #endif
++>>>>>>> upstream/1.5.1
          }
  
          SCALARTYPE  csr_threshold()  const { return csr_threshold_; }
          void csr_threshold(SCALARTYPE thr) { csr_threshold_ = thr; }
  
++<<<<<<< HEAD
 +        std::size_t internal_size1() const { return viennacl::tools::roundUpToNextMultiple<std::size_t>(rows_, ALIGNMENT); }
 +        std::size_t internal_size2() const { return viennacl::tools::roundUpToNextMultiple<std::size_t>(cols_, ALIGNMENT); }
 +
 +        std::size_t size1() const { return rows_; }
 +        std::size_t size2() const { return cols_; }
 +
 +        std::size_t internal_ellnnz() const {return viennacl::tools::roundUpToNextMultiple<std::size_t>(ellnnz_, ALIGNMENT); }
 +        std::size_t ell_nnz() const { return ellnnz_; }
 +        std::size_t csr_nnz() const { return csrnnz_; }
 +
 +        const viennacl::ocl::handle<cl_mem>& handle1() const { return ell_elements_; } 
 +        const viennacl::ocl::handle<cl_mem>& handle2() const { return ell_coords_; }
 +        const viennacl::ocl::handle<cl_mem>& handle3() const { return csr_rows_; } 
 +        const viennacl::ocl::handle<cl_mem>& handle4() const { return csr_cols_; } 
 +        const viennacl::ocl::handle<cl_mem>& handle5() const { return csr_elements_; }  
 +    
 +      public:    
 +        template <typename CPU_MATRIX, typename T, unsigned int ALIGN>
 +        friend void copy(const CPU_MATRIX & cpu_matrix, hyb_matrix<T, ALIGN> & gpu_matrix );
 +
 +      private:
 +        SCALARTYPE  csr_threshold_;
 +        std::size_t rows_;
 +        std::size_t cols_;
 +        std::size_t ellnnz_;
 +        std::size_t csrnnz_;
 +
 +        viennacl::ocl::handle<cl_mem> ell_coords_; // ell coords
 +        viennacl::ocl::handle<cl_mem> ell_elements_; // ell elements
 +        
 +        viennacl::ocl::handle<cl_mem> csr_rows_;
 +        viennacl::ocl::handle<cl_mem> csr_cols_;
 +        viennacl::ocl::handle<cl_mem> csr_elements_;
++=======
+         vcl_size_t internal_size1() const { return viennacl::tools::align_to_multiple<vcl_size_t>(rows_, ALIGNMENT); }
+         vcl_size_t internal_size2() const { return viennacl::tools::align_to_multiple<vcl_size_t>(cols_, ALIGNMENT); }
+ 
+         vcl_size_t size1() const { return rows_; }
+         vcl_size_t size2() const { return cols_; }
+ 
+         vcl_size_t internal_ellnnz() const {return viennacl::tools::align_to_multiple<vcl_size_t>(ellnnz_, ALIGNMENT); }
+         vcl_size_t ell_nnz() const { return ellnnz_; }
+         vcl_size_t csr_nnz() const { return csrnnz_; }
+ 
+         const handle_type & handle() const { return ell_elements_; }
+         const handle_type & handle2() const { return ell_coords_; }
+         const handle_type & handle3() const { return csr_rows_; }
+         const handle_type & handle4() const { return csr_cols_; }
+         const handle_type & handle5() const { return csr_elements_; }
+ 
+       public:
+       #if defined(_MSC_VER) && _MSC_VER < 1500          //Visual Studio 2005 needs special treatment
+         template <typename CPU_MATRIX>
+         friend void copy(const CPU_MATRIX & cpu_matrix, hyb_matrix & gpu_matrix );
+       #else
+         template <typename CPU_MATRIX, typename T, unsigned int ALIGN>
+         friend void copy(const CPU_MATRIX & cpu_matrix, hyb_matrix<T, ALIGN> & gpu_matrix );
+       #endif
+ 
+       private:
+         SCALARTYPE  csr_threshold_;
+         vcl_size_t rows_;
+         vcl_size_t cols_;
+         vcl_size_t ellnnz_;
+         vcl_size_t csrnnz_;
+ 
+         handle_type ell_coords_; // ell coords
+         handle_type ell_elements_; // ell elements
+ 
+         handle_type csr_rows_;
+         handle_type csr_cols_;
+         handle_type csr_elements_;
++>>>>>>> upstream/1.5.1
      };
  
      template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
      void copy(const CPU_MATRIX& cpu_matrix, hyb_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix )
      {
++<<<<<<< HEAD
 +      if(cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
 +      {
 +        //determine max capacity for row
 +        std::size_t max_entries_per_row = 0;
 +        std::vector<std::size_t> hist_entries(cpu_matrix.size1(), 0);
 +
 +        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
 +        {
 +            std::size_t num_entries = 0;
++=======
+       assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+       assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+ 
+       if(cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
+       {
+         //determine max capacity for row
+         vcl_size_t max_entries_per_row = 0;
+         std::vector<vcl_size_t> hist_entries(cpu_matrix.size1() + 1, 0);
+ 
+         for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+         {
+             vcl_size_t num_entries = 0;
++>>>>>>> upstream/1.5.1
              for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
              {
                  ++num_entries;
@@@ -106,12 -131,12 +217,21 @@@
              hist_entries[num_entries] += 1;
              max_entries_per_row = std::max(max_entries_per_row, num_entries);
          }
++<<<<<<< HEAD
 +        
 +        std::size_t sum = 0;
 +        for(std::size_t ind = 0; ind <= max_entries_per_row; ind++)
 +        {
 +            sum += hist_entries[ind];
 +            
++=======
+ 
+         vcl_size_t sum = 0;
+         for(vcl_size_t ind = 0; ind <= max_entries_per_row; ind++)
+         {
+             sum += hist_entries[ind];
+ 
++>>>>>>> upstream/1.5.1
              if(sum >= gpu_matrix.csr_threshold() * cpu_matrix.size1())
              {
                  max_entries_per_row = ind;
@@@ -124,33 -149,33 +244,62 @@@
          gpu_matrix.rows_ = cpu_matrix.size1();
          gpu_matrix.cols_ = cpu_matrix.size2();
  
++<<<<<<< HEAD
 +        std::size_t nnz = gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz();
 +
 +        std::vector<cl_uint> ell_coords(nnz, 0);
 +        std::vector<cl_uint> csr_rows(cpu_matrix.size1() + 1, 0);
 +        std::vector<cl_uint> csr_cols;
 +
 +        std::vector<SCALARTYPE> ell_elements(nnz, 0.0f);
 +        std::vector<SCALARTYPE> csr_elements;
 +
 +        std::size_t csr_index = 0;
 +
 +        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
 +        {
 +          std::size_t data_index = 0;
 +  
 +          csr_rows[row_it.index1()] = csr_index;
 +          
++=======
+         vcl_size_t nnz = gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz();
+ 
+         viennacl::backend::typesafe_host_array<unsigned int>  ell_coords(gpu_matrix.ell_coords_, nnz);
+         viennacl::backend::typesafe_host_array<unsigned int>  csr_rows(gpu_matrix.csr_rows_, cpu_matrix.size1() + 1);
+         std::vector<unsigned int> csr_cols;
+ 
+         std::vector<SCALARTYPE> ell_elements(nnz);
+         std::vector<SCALARTYPE> csr_elements;
+ 
+         vcl_size_t csr_index = 0;
+ 
+         for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+         {
+           vcl_size_t data_index = 0;
+ 
+           csr_rows.set(row_it.index1(), csr_index);
+ 
++>>>>>>> upstream/1.5.1
            for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
            {
              if(data_index < max_entries_per_row)
              {
++<<<<<<< HEAD
 +                ell_coords[gpu_matrix.internal_size1() * data_index + col_it.index1()]   = col_it.index2();
 +                ell_elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;                        
 +            }
 +            else
 +            {
 +                csr_cols.push_back(col_it.index2());
++=======
+                 ell_coords.set(gpu_matrix.internal_size1() * data_index + col_it.index1(), col_it.index2());
+                 ell_elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;
+             }
+             else
+             {
+                 csr_cols.push_back(static_cast<unsigned int>(col_it.index2()));
++>>>>>>> upstream/1.5.1
                  csr_elements.push_back(*col_it);
  
                  csr_index++;
@@@ -167,55 -192,51 +316,100 @@@
            csr_elements.push_back(0);
          }
  
++<<<<<<< HEAD
 +        csr_rows[csr_rows.size() - 1] = csr_index;
 +
 +        gpu_matrix.csrnnz_ = csr_cols.size();
 +
 +        gpu_matrix.ell_coords_   = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, ell_coords);
 +        gpu_matrix.ell_elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, ell_elements);
 +
 +        gpu_matrix.csr_rows_   = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, csr_rows);
 +        gpu_matrix.csr_cols_   = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, csr_cols);
 +        gpu_matrix.csr_elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, csr_elements);
 +
++=======
+         csr_rows.set(csr_rows.size() - 1, csr_index);
+ 
+         gpu_matrix.csrnnz_ = csr_cols.size();
+ 
+         viennacl::backend::typesafe_host_array<unsigned int> csr_cols_for_gpu(gpu_matrix.csr_cols_, csr_cols.size());
+         for (vcl_size_t i=0; i<csr_cols.size(); ++i)
+           csr_cols_for_gpu.set(i, csr_cols[i]);
+ 
+         viennacl::backend::memory_create(gpu_matrix.ell_coords_,   ell_coords.raw_size(),                    traits::context(gpu_matrix.ell_coords_), ell_coords.get());
+         viennacl::backend::memory_create(gpu_matrix.ell_elements_, sizeof(SCALARTYPE) * ell_elements.size(), traits::context(gpu_matrix.ell_elements_), &(ell_elements[0]));
+ 
+         viennacl::backend::memory_create(gpu_matrix.csr_rows_,     csr_rows.raw_size(),                      traits::context(gpu_matrix.csr_rows_), csr_rows.get());
+         viennacl::backend::memory_create(gpu_matrix.csr_cols_,     csr_cols_for_gpu.raw_size(),              traits::context(gpu_matrix.csr_cols_), csr_cols_for_gpu.get());
+         viennacl::backend::memory_create(gpu_matrix.csr_elements_, sizeof(SCALARTYPE) * csr_elements.size(), traits::context(gpu_matrix.csr_elements_), &(csr_elements[0]));
++>>>>>>> upstream/1.5.1
        }
      }
  
      template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
      void copy(const hyb_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix, CPU_MATRIX& cpu_matrix)
      {
++<<<<<<< HEAD
 +      if(gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
 +      {
 +        cpu_matrix.resize(gpu_matrix.size1(), gpu_matrix.size2());
 +
 +        std::vector<SCALARTYPE> ell_elements(gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
 +        std::vector<cl_uint> ell_coords(gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
 +
 +        std::vector<SCALARTYPE> csr_elements(gpu_matrix.csr_nnz());
 +        std::vector<cl_uint> csr_rows(gpu_matrix.size1() + 1);
 +        std::vector<cl_uint> csr_cols(gpu_matrix.csr_nnz());
 +
 +        cl_int err;
 +
 +        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle1(), CL_TRUE, 0, sizeof(SCALARTYPE) * ell_elements.size(), &(ell_elements[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle2(), CL_TRUE, 0, sizeof(cl_uint) * ell_coords.size(), &(ell_coords[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle3(), CL_TRUE, 0, sizeof(cl_uint) * csr_rows.size(), &(csr_rows[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle4(), CL_TRUE, 0, sizeof(cl_uint) * csr_cols.size(), &(csr_cols[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle5(), CL_TRUE, 0, sizeof(SCALARTYPE) * csr_elements.size(), &(csr_elements[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +
 +        viennacl::ocl::get_queue().finish();
 +
 +        for(std::size_t row = 0; row < gpu_matrix.size1(); row++)
 +        {
 +          for(std::size_t ind = 0; ind < gpu_matrix.internal_ellnnz(); ind++)
 +          {
 +            std::size_t offset = gpu_matrix.internal_size1() * ind + row;
 +            
++=======
+       assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+       assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+ 
+       if(gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
+       {
+         std::vector<SCALARTYPE> ell_elements(gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
+         viennacl::backend::typesafe_host_array<unsigned int> ell_coords(gpu_matrix.handle2(), gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
+ 
+         std::vector<SCALARTYPE> csr_elements(gpu_matrix.csr_nnz());
+         viennacl::backend::typesafe_host_array<unsigned int> csr_rows(gpu_matrix.handle3(), gpu_matrix.size1() + 1);
+         viennacl::backend::typesafe_host_array<unsigned int> csr_cols(gpu_matrix.handle4(), gpu_matrix.csr_nnz());
+ 
+         viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE) * ell_elements.size(), &(ell_elements[0]));
+         viennacl::backend::memory_read(gpu_matrix.handle2(), 0, ell_coords.raw_size(), ell_coords.get());
+         viennacl::backend::memory_read(gpu_matrix.handle3(), 0, csr_rows.raw_size(),   csr_rows.get());
+         viennacl::backend::memory_read(gpu_matrix.handle4(), 0, csr_cols.raw_size(),   csr_cols.get());
+         viennacl::backend::memory_read(gpu_matrix.handle5(), 0, sizeof(SCALARTYPE) * csr_elements.size(), &(csr_elements[0]));
+ 
+ 
+         for(vcl_size_t row = 0; row < gpu_matrix.size1(); row++)
+         {
+           for(vcl_size_t ind = 0; ind < gpu_matrix.internal_ellnnz(); ind++)
+           {
+             vcl_size_t offset = gpu_matrix.internal_size1() * ind + row;
+ 
++>>>>>>> upstream/1.5.1
              if(ell_elements[offset] == static_cast<SCALARTYPE>(0.0))
              {
                continue;
@@@ -230,7 -251,7 +424,11 @@@
              cpu_matrix(row, ell_coords[offset]) = ell_elements[offset];
            }
  
++<<<<<<< HEAD
 +          for(std::size_t ind = csr_rows[row]; ind < csr_rows[row+1]; ind++)
++=======
+           for(vcl_size_t ind = csr_rows[row]; ind < csr_rows[row+1]; ind++)
++>>>>>>> upstream/1.5.1
            {
              if(csr_elements[ind] == static_cast<SCALARTYPE>(0.0))
              {
@@@ -250,88 -271,98 +448,186 @@@
      }
  
  
++<<<<<<< HEAD
 +    namespace linalg
 +    {
 +      
 +      /** @brief Returns a proxy class that represents matrix-vector multiplication with a hyb_matrix
 +      *
 +      * This is used for the convenience expression result = prod(mat, vec);
 +      *
 +      * @param mat    The matrix
 +      * @param vec    The vector
 +      */
 +      template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
 +      vector_expression<const hyb_matrix<SCALARTYPE, ALIGNMENT>,
 +                        const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
 +                        op_prod > prod_impl(const hyb_matrix<SCALARTYPE, ALIGNMENT> & mat, 
 +                                      const vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
 +      {
 +        return vector_expression<const hyb_matrix<SCALARTYPE, ALIGNMENT>,
 +                                const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
 +                                op_prod >(mat, vec);
 +      }
 +      
 +      template<class TYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
 +      void prod_impl( const viennacl::hyb_matrix<TYPE, ALIGNMENT>& mat, 
 +                      const viennacl::vector<TYPE, VECTOR_ALIGNMENT>& vec,
 +                      viennacl::vector<TYPE, VECTOR_ALIGNMENT>& result)
 +      {
 +        assert(mat.size1() == result.size());
 +        assert(mat.size2() == vec.size());
 +
 +        result.clear();
 +
 +        viennacl::ocl::kernel& k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::hyb_matrix<TYPE, ALIGNMENT>::program_name(), "vec_mul");
 +
 +        unsigned int thread_num = 256;
 +        unsigned int group_num = 32;
 +
 +        k.local_work_size(0, thread_num);
 +        k.global_work_size(0, thread_num * group_num);
 +
 +        viennacl::ocl::enqueue(k(mat.handle2(), 
 +                                mat.handle1(),
 +                                mat.handle3(),
 +                                mat.handle4(),
 +                                mat.handle5(),
 +                                vec,
 +                                result,
 +                                cl_uint(mat.size1()),
 +                                cl_uint(mat.internal_size1()),
 +                                cl_uint(mat.ell_nnz()),
 +                                cl_uint(mat.internal_ellnnz())
 +                                ) 
 +        );
 +      }
 +    }
 +
 +    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
 +    *
 +    * @param proxy  An expression template proxy class.
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT>
 +    template <unsigned int MAT_ALIGNMENT>
 +    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
 +    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const hyb_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
 +                                                                                          viennacl::op_prod> & proxy) 
 +    {
 +      // check for the special case x = A * x
 +      if (proxy.rhs().handle().get() == this->handle().get())
 +      {
 +        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
 +        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
 +        *this = result;
 +        return *this;
 +      }
 +      else
 +      {
 +        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +      return *this;
 +    }
 +
 +}
 +
- #endif
++#endif
++=======
+     //
+     // Specify available operations:
+     //
+ 
+     /** \cond */
+ 
+     namespace linalg
+     {
+       namespace detail
+       {
+         // x = A * y
+         template <typename T, unsigned int A>
+         struct op_executor<vector_base<T>, op_assign, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+             {
+               // check for the special case x = A * x
+               if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+               {
+                 viennacl::vector<T> temp(lhs);
+                 viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+                 lhs = temp;
+               }
+               else
+                 viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+             }
+         };
+ 
+         template <typename T, unsigned int A>
+         struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+             {
+               viennacl::vector<T> temp(lhs);
+               viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+               lhs += temp;
+             }
+         };
+ 
+         template <typename T, unsigned int A>
+         struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+             {
+               viennacl::vector<T> temp(lhs);
+               viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+               lhs -= temp;
+             }
+         };
+ 
+ 
+         // x = A * vec_op
+         template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+         struct op_executor<vector_base<T>, op_assign, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+             {
+               viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+               viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+             }
+         };
+ 
+         // x = A * vec_op
+         template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+         struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+             {
+               viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+               viennacl::vector<T> temp_result(lhs);
+               viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+               lhs += temp_result;
+             }
+         };
+ 
+         // x = A * vec_op
+         template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+         struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+         {
+             static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+             {
+               viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+               viennacl::vector<T> temp_result(lhs);
+               viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+               lhs -= temp_result;
+             }
+         };
+ 
+       } // namespace detail
+     } // namespace linalg
+ 
+     /** \endcond */
+ }
+ 
+ #endif
++>>>>>>> upstream/1.5.1
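
Putting the hyb_matrix pieces above together, a usage sketch: fill the device matrix from a host matrix exposing uBLAS-style iterator1/iterator2 traversal (Boost.uBLAS is an assumption made for this example, not a requirement of the diff) and run the product that the op_executor specializations dispatch:

    #include <vector>
    #include <boost/numeric/ublas/matrix_sparse.hpp>
    #include "viennacl/hyb_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/prod.hpp"

    int main()
    {
      boost::numeric::ublas::compressed_matrix<float> host_A(1000, 1000);
      for (unsigned int i = 0; i < 1000; ++i)
        host_A(i, i) = 2.0f;                 // simple diagonal test matrix

      viennacl::hyb_matrix<float> A;
      viennacl::copy(host_A, A);             // splits nonzeros into ELL and CSR parts

      std::vector<float> host_x(1000, 1.0f);
      viennacl::vector<float> x(1000), y(1000);
      viennacl::copy(host_x, x);

      y = viennacl::linalg::prod(A, x);      // resolved via op_executor<..., op_assign, ...>
      return 0;
    }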
diff --cc viennacl/linalg/bicgstab.hpp
index d3609a6,642b490..38dedbd
--- a/viennacl/linalg/bicgstab.hpp
+++ b/viennacl/linalg/bicgstab.hpp
@@@ -102,47 -108,52 +108,62 @@@ namespace viennac
        CPU_ScalarType beta;
        CPU_ScalarType alpha;
        CPU_ScalarType omega;
++<<<<<<< HEAD
 +      ScalarType inner_prod_temp; //temporary variable for inner product computation
 +      ScalarType new_ip_rr0star = 0;
 +      
 +      if (norm_rhs_host == 0) //solution is zero if RHS norm is zero
 +        return result;
 +      
 +      for (unsigned int i = 0; i < tag.max_iterations(); ++i)
++=======
+       //ScalarType inner_prod_temp; //temporary variable for inner product computation
+       CPU_ScalarType new_ip_rr0star = 0;
+       CPU_ScalarType residual_norm = norm_rhs_host;
+ 
+       if (norm_rhs_host == 0) //solution is zero if RHS norm is zero
+         return result;
+ 
+       bool restart_flag = true;
+       vcl_size_t last_restart = 0;
+       for (vcl_size_t i = 0; i < tag.max_iterations(); ++i)
++>>>>>>> upstream/1.5.1
        {
+         if (restart_flag)
+         {
+           residual = rhs;
+           residual -= viennacl::linalg::prod(matrix, result);
+           p = residual;
+           r0star = residual;
+           ip_rr0star = viennacl::linalg::norm_2(residual);
+           ip_rr0star *= ip_rr0star;
+           restart_flag = false;
+           last_restart = i;
+         }
+ 
          tag.iters(i+1);
          tmp0 = viennacl::linalg::prod(matrix, p);
-         //alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
-         inner_prod_temp = viennacl::linalg::inner_prod(tmp0, r0star);
-         alpha = ip_rr0star / static_cast<CPU_ScalarType>(inner_prod_temp);
- 
-         //s = residual - alpha*tmp0;
-         s = residual;
-         s -= alpha*tmp0;
-         
+         alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
+ 
+         s = residual - alpha*tmp0;
+ 
          tmp1 = viennacl::linalg::prod(matrix, s);
-         //omega = viennacl::linalg::inner_prod(tmp1, s) / viennacl::linalg::inner_prod(tmp1, tmp1);
-         inner_prod_temp = viennacl::linalg::inner_prod(tmp1, s);
-         omega = inner_prod_temp;
-         inner_prod_temp = viennacl::linalg::inner_prod(tmp1, tmp1);
-         omega /= inner_prod_temp;
-         
-         //result += alpha * p + omega * s;
-         result += alpha * p;
-         result += omega * s;
-         
-         //residual = s - omega * tmp1;
-         residual = s;
-         residual -= omega*tmp1;
-         
-         new_ip_rr0star = viennacl::linalg::inner_prod(residual,r0star);
-         if (fabs(CPU_ScalarType(viennacl::linalg::inner_prod(residual, residual)) / norm_rhs_host) < tag.tolerance() * tag.tolerance())
+         CPU_ScalarType norm_tmp1 = viennacl::linalg::norm_2(tmp1);
+         omega = viennacl::linalg::inner_prod(tmp1, s) / (norm_tmp1 * norm_tmp1);
+ 
+         result += alpha * p + omega * s;
+         residual = s - omega * tmp1;
+ 
+         new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);
+         residual_norm = viennacl::linalg::norm_2(residual);
+         if (std::fabs(residual_norm / norm_rhs_host) < tag.tolerance())
            break;
-         
-         //beta = new_ip_rr0star / ip_rr0star * alpha/omega;
-         CPU_ScalarType cpu_temp = new_ip_rr0star; //read from device only once
-         beta = cpu_temp / ip_rr0star * alpha/omega;
-         ip_rr0star = cpu_temp;
+ 
+         beta = new_ip_rr0star / ip_rr0star * alpha/omega;
+         ip_rr0star = new_ip_rr0star;
+ 
+         if (ip_rr0star == 0 || omega == 0 || i - last_restart > tag.max_iterations_before_restart()) //search direction degenerate. A restart might help
+           restart_flag = true;
  
          // Execution of
          //  p = residual + beta * (p - omega*tmp0);
@@@ -197,14 -205,29 +215,38 @@@
        CPU_ScalarType beta;
        CPU_ScalarType alpha;
        CPU_ScalarType omega;
++<<<<<<< HEAD
 +      ScalarType new_ip_rr0star = 0;
 +      ScalarType inner_prod_temp; //temporary variable for inner product
 +      
 +      if (norm_rhs_host == 0) //solution is zero if RHS norm is zero
 +        return result;
 +      
++=======
+       CPU_ScalarType new_ip_rr0star = 0;
+       CPU_ScalarType residual_norm = norm_rhs_host;
+ 
+       if (norm_rhs_host == 0) //solution is zero if RHS norm is zero
+         return result;
+ 
+       bool restart_flag = true;
+       vcl_size_t last_restart = 0;
++>>>>>>> upstream/1.5.1
        for (unsigned int i = 0; i < tag.max_iterations(); ++i)
        {
+         if (restart_flag)
+         {
+           residual = rhs;
+           residual -= viennacl::linalg::prod(matrix, result);
+           precond.apply(residual);
+           p = residual;
+           r0star = residual;
+           ip_rr0star = viennacl::linalg::norm_2(residual);
+           ip_rr0star *= ip_rr0star;
+           restart_flag = false;
+           last_restart = i;
+         }
+ 
          tag.iters(i+1);
          tmp0 = viennacl::linalg::prod(matrix, p);
          precond.apply(tmp0);
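
The upstream side of the bicgstab.hpp hunks above adds a restart mechanism to
BiCGStab: whenever <r, r0*> or omega collapses to zero, or more than
tag.max_iterations_before_restart() iterations have passed since the last
restart, the residual, p and r0star are rebuilt from the current iterate.
Recomputing the true residual at a restart also discards the drift that
accumulates in the recursively updated residual. A minimal usage sketch of
tuning that threshold follows; it assumes the 1.5.x bicgstab_tag constructor
takes (tolerance, max_iterations, max_iterations_before_restart), matching
the accessor used in the hunk:

    // Hedged sketch, not part of the diff: configuring the restart
    // threshold used by the new code path above.
    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/bicgstab.hpp"

    viennacl::vector<double> solve_bicgstab(viennacl::compressed_matrix<double> const & A,
                                            viennacl::vector<double> const & rhs)
    {
      // relative tolerance 1e-8, at most 400 iterations,
      // force a restart after 200 iterations without one
      viennacl::linalg::bicgstab_tag tag(1e-8, 400, 200);
      return viennacl::linalg::solve(A, rhs, tag);
    }
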
diff --cc viennacl/linalg/bisect.hpp
index b386926,3c04917..5cb9006
--- a/viennacl/linalg/bisect.hpp
+++ b/viennacl/linalg/bisect.hpp
@@@ -2,16 -2,17 +2,27 @@@
  #define VIENNACL_LINALG_BISECT_HPP_
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
@@@ -19,7 -20,7 +30,11 @@@
  
  /** @file viennacl/linalg/bisect.hpp
  *   @brief Implementation of the algorithm for finding eigenvalues of a tridiagonal matrix.
++<<<<<<< HEAD
 +* 
++=======
+ *
++>>>>>>> upstream/1.5.1
  *   Contributed by Guenther Mader and Astrid Rupp.
  */
  
@@@ -31,37 -32,37 +46,66 @@@
  
  namespace viennacl
  {
++<<<<<<< HEAD
 +  namespace linalg 
 +  {
 +    
 +    namespace detail
 +    {
 +      /** 
 +      *    @brief overloaded function for copying vectors 
++=======
+   namespace linalg
+   {
+ 
+     namespace detail
+     {
+       /**
+       *    @brief overloaded function for copying vectors
++>>>>>>> upstream/1.5.1
        */
        template <typename T, typename OtherVectorType>
        void copy_vec_to_vec(viennacl::vector<T> const & src, OtherVectorType & dest)
        {
++<<<<<<< HEAD
 +        viennacl::copy(src, dest); 
++=======
+         viennacl::copy(src, dest);
++>>>>>>> upstream/1.5.1
        }
  
        template <typename OtherVectorType, typename T>
        void copy_vec_to_vec(OtherVectorType const & src, viennacl::vector<T> & dest)
        {
++<<<<<<< HEAD
 +        viennacl::copy(src, dest); 
++=======
+         viennacl::copy(src, dest);
++>>>>>>> upstream/1.5.1
        }
  
        template <typename VectorType1, typename VectorType2>
        void copy_vec_to_vec(VectorType1 const & src, VectorType2 & dest)
        {
++<<<<<<< HEAD
 +        for (std::size_t i=0; i<src.size(); ++i)
 +          dest[i] = src[i]; 
 +      }
 +    }
 +    
 +    /** 
 +    *   @brief Implementation of the bisection algorithm for calculating the eigenvalues of a tridiagonal matrix. Experimental - interface might change.
 +    *   
++=======
+         for (vcl_size_t i=0; i<src.size(); ++i)
+           dest[i] = src[i];
+       }
+     }
+ 
+     /**
+     *   @brief Implementation of the bisection algorithm for calculating the eigenvalues of a tridiagonal matrix. Experimental - interface might change.
+     *
++>>>>>>> upstream/1.5.1
      *   @param alphas       Elements of the main diagonal
      *   @param betas        Elements of the secondary diagonal
      *   @return             Returns the eigenvalues of the tridiagonal matrix defined by alpha and beta
@@@ -69,13 -70,13 +113,23 @@@
      template< typename VectorT >
      std::vector<
              typename viennacl::result_of::cpu_value_type<typename VectorT::value_type>::type
++<<<<<<< HEAD
 +            > 
 +    bisect(VectorT const & alphas, VectorT const & betas)
 +    {
 +      typedef typename viennacl::result_of::value_type<VectorT>::type           ScalarType;
 +      typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;  
 +
 +      std::size_t size = betas.size();
++=======
+             >
+     bisect(VectorT const & alphas, VectorT const & betas)
+     {
+       typedef typename viennacl::result_of::value_type<VectorT>::type           ScalarType;
+       typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+ 
+       vcl_size_t size = betas.size();
++>>>>>>> upstream/1.5.1
        std::vector<CPU_ScalarType>  x_temp(size);
  
  
@@@ -84,24 -85,24 +138,42 @@@
  
        double rel_error = std::numeric_limits<CPU_ScalarType>::epsilon();
        beta_bisect.push_back(0);
++<<<<<<< HEAD
 +    
 +      for(std::size_t i = 1; i < size; i++){
 +              beta_bisect.push_back(betas[i] * betas[i]);
 +      }
 +
 +      double xmin = alphas[size - 1] - std::abs<CPU_ScalarType>(betas[size - 1]);
 +      double xmax = alphas[size - 1] + std::abs<CPU_ScalarType>(betas[size - 1]);
 +
 +      for(std::size_t i = 0; i < size - 1; i++)
 +      {
 +        double h = std::abs<CPU_ScalarType>(betas[i]) + std::abs<CPU_ScalarType>(betas[i + 1]);
++=======
+ 
+       for(vcl_size_t i = 1; i < size; i++){
+               beta_bisect.push_back(betas[i] * betas[i]);
+       }
+ 
+       double xmin = alphas[size - 1] - std::fabs(betas[size - 1]);
+       double xmax = alphas[size - 1] + std::fabs(betas[size - 1]);
+ 
+       for(vcl_size_t i = 0; i < size - 1; i++)
+       {
+         double h = std::fabs(betas[i]) + std::fabs(betas[i + 1]);
++>>>>>>> upstream/1.5.1
          if (alphas[i] + h > xmax)
            xmax = alphas[i] + h;
          if (alphas[i] - h < xmin)
            xmin = alphas[i] - h;
        }
  
++<<<<<<< HEAD
 +      
++=======
+ 
++>>>>>>> upstream/1.5.1
        double eps1 = 1e-6;
        /*double eps2 = (xmin + xmax > 0) ? (rel_error * xmax) : (-rel_error * xmin);
        if(eps1 <= 0)
@@@ -111,13 -112,13 +183,21 @@@
  
        double x0 = xmax;
  
++<<<<<<< HEAD
 +      for(std::size_t i = 0; i < size; i++)
++=======
+       for(vcl_size_t i = 0; i < size; i++)
++>>>>>>> upstream/1.5.1
        {
          x_temp[i] = xmax;
          wu.push_back(xmin);
        }
  
++<<<<<<< HEAD
 +      for(long k = size - 1; k >= 0; --k)
++=======
+       for(long k = static_cast<long>(size) - 1; k >= 0; --k)
++>>>>>>> upstream/1.5.1
        {
          double xu = xmin;
          for(long i = k; i >= 0; --i)
@@@ -128,27 -129,27 +208,48 @@@
              break;
            }
          }
++<<<<<<< HEAD
 +        
++=======
+ 
++>>>>>>> upstream/1.5.1
          if(x0 > x_temp[k])
            x0 = x_temp[k];
  
          double x1 = (xu + x0) / 2.0;
++<<<<<<< HEAD
 +        while (x0 - xu > 2.0 * rel_error * (std::abs(xu) + std::abs(x0)) + eps1)
 +        {
 +          std::size_t a = 0;
 +          double q = 1;
 +          for(std::size_t i = 0; i < size; i++)
++=======
+         while (x0 - xu > 2.0 * rel_error * (std::fabs(xu) + std::fabs(x0)) + eps1)
+         {
+           vcl_size_t a = 0;
+           double q = 1;
+           for(vcl_size_t i = 0; i < size; i++)
++>>>>>>> upstream/1.5.1
            {
              if(q != 0)
                q = alphas[i] - x1 - beta_bisect[i] / q;
              else
++<<<<<<< HEAD
 +              q = alphas[i] - x1 - std::abs(betas[i] / rel_error);
++=======
+               q = alphas[i] - x1 - std::fabs(betas[i] / rel_error);
++>>>>>>> upstream/1.5.1
  
              if(q < 0)
                a++;
            }
++<<<<<<< HEAD
 +          
 +          if (a <= static_cast<std::size_t>(k))
++=======
+ 
+           if (a <= static_cast<vcl_size_t>(k))
++>>>>>>> upstream/1.5.1
            {
              xu = x1;
              if(a < 1)
@@@ -169,7 -170,7 +270,14 @@@
        }
        return x_temp;
      }
++<<<<<<< HEAD
 +    
 +  } // end namespace linalg
 +} // end namespace viennacl
- #endif
++#endif
++=======
+ 
+   } // end namespace linalg
+ } // end namespace viennacl
+ #endif
++>>>>>>> upstream/1.5.1
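
Since copy_vec_to_vec() above is overloaded for viennacl::vector and for
generic host vectors, bisect() accepts plain std::vector arguments as well.
A hedged usage sketch (the setup loop in the implementation reads betas[1]
onward, so betas[0] is only a placeholder):

    // Hedged usage sketch for the bisection eigenvalue routine above.
    #include <iostream>
    #include <vector>
    #include "viennacl/linalg/bisect.hpp"

    int main()
    {
      std::vector<double> alphas(5, 2.0);   // main diagonal
      std::vector<double> betas(5, -1.0);   // secondary diagonal
      betas[0] = 0.0;                       // unused placeholder entry

      // eigenvalues of the symmetric tridiagonal matrix (alphas, betas)
      std::vector<double> eigenvalues = viennacl::linalg::bisect(alphas, betas);

      for (std::size_t i = 0; i < eigenvalues.size(); ++i)
        std::cout << eigenvalues[i] << std::endl;
      return 0;
    }
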
diff --cc viennacl/linalg/cg.hpp
index d02affd,e981239..c68bfb9
--- a/viennacl/linalg/cg.hpp
+++ b/viennacl/linalg/cg.hpp
@@@ -105,12 -104,12 +104,18 @@@ namespace viennac
        CPU_ScalarType alpha;
        CPU_ScalarType new_ip_rr = 0;
        CPU_ScalarType beta;
-       CPU_ScalarType norm_rhs_squared = ip_rr;
-       
+       CPU_ScalarType norm_rhs = std::sqrt(ip_rr);
+ 
        //std::cout << "Starting CG solver iterations... " << std::endl;
++<<<<<<< HEAD
 +      if (norm_rhs_squared == 0) //solution is zero if RHS norm is zero
 +        return result;
 +      
++=======
+       if (norm_rhs == 0) //solution is zero if RHS norm is zero
+         return result;
+ 
++>>>>>>> upstream/1.5.1
        for (unsigned int i = 0; i < tag.max_iterations(); ++i)
        {
          tag.iters(i+1);
@@@ -186,7 -177,7 +183,11 @@@
  
        if (norm_rhs_squared == 0) //solution is zero if RHS norm is zero
          return result;
++<<<<<<< HEAD
 +      
++=======
+ 
++>>>>>>> upstream/1.5.1
        for (unsigned int i = 0; i < tag.max_iterations(); ++i)
        {
          tag.iters(i+1);
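
The cg.hpp hunk replaces norm_rhs_squared with norm_rhs = sqrt(ip_rr), which
points to the same change of convergence test seen in the BiCGStab hunk
above: the relative residual ||r|| / ||b|| is compared against
tag.tolerance() directly instead of comparing squared quantities against the
squared tolerance. A sketch of that criterion; converged() is a hypothetical
helper for illustration, not the verbatim upstream code:

    // Hedged sketch of the relative-residual test implied by the change
    // from norm_rhs_squared to norm_rhs.
    #include <cmath>

    template <typename ScalarType>
    bool converged(ScalarType ip_rr,      // current <r, r>
                   ScalarType norm_rhs,   // ||b||, fixed before the iteration
                   ScalarType tolerance)
    {
      return std::sqrt(ip_rr) / norm_rhs < tolerance;
    }
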
diff --cc viennacl/linalg/detail/ilu/block_ilu.hpp
index e62fb22,406553a..76faa61
--- a/viennacl/linalg/detail/ilu/block_ilu.hpp
+++ b/viennacl/linalg/detail/ilu/block_ilu.hpp
@@@ -2,23 -2,24 +2,38 @@@
  #define VIENNACL_LINALG_DETAIL_BLOCK_ILU_HPP_
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
  ============================================================================= */
  
  /** @file viennacl/linalg/detail/ilu/block_ilu.hpp
++<<<<<<< HEAD
 +    @brief Implementations of incomplete factorization preconditioners
++=======
+     @brief Implementations of incomplete block factorization preconditioners
++>>>>>>> upstream/1.5.1
  */
  
  #include <vector>
@@@ -37,85 -38,96 +52,178 @@@ namespace viennac
    {
      namespace detail
      {
++<<<<<<< HEAD
 +      template <typename VectorType>
 +      class ilu_vector_range
 +      {
 +        public:
 +          typedef typename VectorType::value_type      value_type;
 +          typedef typename VectorType::size_type       size_type;
 +          
 +          ilu_vector_range(VectorType & v,
 +                           size_type start_index,
 +                           size_type vec_size
 +                          ) : vec_(v), start_(start_index), size_(vec_size) {}
 +          
 +          value_type & operator()(size_type index)
 +          {
 +            assert(index < size_ && "Index out of bounds!");
 +            
 +            return vec_[start_ + index];  
 +          }
 +          
 +          value_type & operator[](size_type index)
 +          {
 +            return this->operator()(index);
 +          }
 +          
 +          size_type size() const { return size_; }
 +          
 +        private:
 +          VectorType & vec_;
 +          size_type start_;
 +          size_type size_;
 +      };
 +      
 +      /** @brief Extracts a diagonal block from a larger system matrix
 +        *
 +        * @param compressed_matrix   The full matrix
 +        * @param block_matrix        The output matrix to which the extracted block is written
 +        * @param start_index         First row- and column-index of the block
 +        * @param stop_index          First row- and column-index beyond the block
 +        */
 +      template <typename MatrixType, typename STLMatrixType>
 +      void extract_block_matrix(MatrixType const & compressed_matrix,
 +                                STLMatrixType & block_matrix,
 +                                std::size_t start_index,
 +                                std::size_t stop_index
 +                                )
 +      {
 +        typedef typename MatrixType::const_iterator1     RowIterator;
 +        typedef typename MatrixType::const_iterator2     ColumnIterator;
 +
 +        for (RowIterator row_iter = compressed_matrix.begin1();
 +                        row_iter != compressed_matrix.end1();
 +                      ++row_iter)
 +        {
 +          if (row_iter.index1() < start_index)
 +            continue;
 +
 +          if (row_iter.index1() >= stop_index)
 +            break;
 +
 +          for (ColumnIterator col_iter = row_iter.begin();
 +                              col_iter != row_iter.end();
 +                            ++col_iter)
 +          {
 +            if (col_iter.index2() < start_index)
 +              continue;
 +
 +            if (col_iter.index2() >= static_cast<std::size_t>(stop_index))
 +              continue;
 +
 +            block_matrix[col_iter.index1() - start_index][col_iter.index2() - start_index] = *col_iter;
 +          }
 +        }
 +      }
 +          
 +      
 +    }
 +
 +    /** @brief A block ILU preconditioner class, can be supplied to solve()-routines
 +     * 
++=======
+       /** @brief Helper range class for representing a subvector of a larger buffer. */
+       template <typename VectorType, typename ValueType, typename SizeType = vcl_size_t>
+       class ilu_vector_range
+       {
+         public:
+           //typedef typename VectorType::value_type      value_type;
+           //typedef typename VectorType::size_type       size_type;
+ 
+           ilu_vector_range(VectorType & v,
+                            SizeType start_index,
+                            SizeType vec_size
+                           ) : vec_(v), start_(start_index), size_(vec_size) {}
+ 
+           ValueType & operator()(SizeType index)
+           {
+             assert(index < size_ && bool("Index out of bounds!"));
+             return vec_[start_ + index];
+           }
+ 
+           ValueType & operator[](SizeType index)
+           {
+             assert(index < size_ && bool("Index out of bounds!"));
+             return vec_[start_ + index];
+           }
+ 
+           SizeType size() const { return size_; }
+ 
+         private:
+           VectorType & vec_;
+           SizeType start_;
+           SizeType size_;
+       };
+ 
+       /** @brief Extracts a diagonal block from a larger system matrix
+         *
+         * @param A                   The full matrix
+         * @param diagonal_block_A    The output matrix to which the extracted block is written
+         * @param start_index         First row- and column-index of the block
+         * @param stop_index          First row- and column-index beyond the block
+         */
+       template <typename ScalarType>
+       void extract_block_matrix(viennacl::compressed_matrix<ScalarType> const & A,
+                                 viennacl::compressed_matrix<ScalarType> & diagonal_block_A,
+                                 vcl_size_t start_index,
+                                 vcl_size_t stop_index
+                                 )
+       {
+ 
+         assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+         assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+         assert( (A.handle().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+ 
+         ScalarType   const * A_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(A.handle());
+         unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+         unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+ 
+         ScalarType   * output_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(diagonal_block_A.handle());
+         unsigned int * output_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(diagonal_block_A.handle1());
+         unsigned int * output_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(diagonal_block_A.handle2());
+ 
+         vcl_size_t output_counter = 0;
+         for (vcl_size_t row = start_index; row < stop_index; ++row)
+         {
+           unsigned int buffer_col_start = A_row_buffer[row];
+           unsigned int buffer_col_end   = A_row_buffer[row+1];
+ 
+           output_row_buffer[row - start_index] = static_cast<unsigned int>(output_counter);
+ 
+           for (unsigned int buf_index = buffer_col_start; buf_index < buffer_col_end; ++buf_index)
+           {
+             unsigned int col = A_col_buffer[buf_index];
+             if (col < start_index)
+               continue;
+ 
+             if (col >= static_cast<unsigned int>(stop_index))
+               continue;
+ 
+             output_col_buffer[output_counter] = static_cast<unsigned int>(col - start_index);
+             output_elements[output_counter] = A_elements[buf_index];
+             ++output_counter;
+           }
+           output_row_buffer[row - start_index + 1] = static_cast<unsigned int>(output_counter);
+         }
+       }
+ 
+ 
+     }
+ 
+     /** @brief A block ILU preconditioner class, can be supplied to solve()-routines
+      *
++>>>>>>> upstream/1.5.1
       * @tparam MatrixType   Type of the system matrix
      * @tparam ILUTag       Type of the tag identifying the ILU preconditioner to be used on each block.
      */
@@@ -123,31 -135,30 +231,58 @@@
      class block_ilu_precond
      {
        typedef typename MatrixType::value_type      ScalarType;
++<<<<<<< HEAD
 +      typedef std::vector< std::map<unsigned int, ScalarType> >   InternalMatrixType;
 +      
 +      public:
 +        typedef std::vector<std::pair<std::size_t, std::size_t> >    index_vector_type;   //the pair refers to the index range [a, b) of each block
 +        
 +        
 +        block_ilu_precond(MatrixType const & mat,
 +                          ILUTag const & tag,
 +                          std::size_t num_blocks = 4
 +                         ) : tag_(tag), LU_blocks(num_blocks)
 +        {
 +          
 +          // Set up vector of block indices:
 +          block_indices_.resize(num_blocks);
 +          for (std::size_t i=0; i<num_blocks; ++i)
 +          {
 +            std::size_t start_index = (   i  * mat.size1()) / num_blocks;
 +            std::size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
 +            
 +            block_indices_[i] = std::pair<std::size_t, std::size_t>(start_index, stop_index);
 +          }
 +          
 +          //initialize preconditioner:
 +          //std::cout << "Start CPU precond" << std::endl;
 +          init(mat);          
++=======
+ 
+       public:
+         typedef std::vector<std::pair<vcl_size_t, vcl_size_t> >    index_vector_type;   //the pair refers to the index range [a, b) of each block
+ 
+ 
+         block_ilu_precond(MatrixType const & mat,
+                           ILUTag const & tag,
+                           vcl_size_t num_blocks = 8
+                          ) : tag_(tag), LU_blocks(num_blocks)
+         {
+ 
+           // Set up vector of block indices:
+           block_indices_.resize(num_blocks);
+           for (vcl_size_t i=0; i<num_blocks; ++i)
+           {
+             vcl_size_t start_index = (   i  * mat.size1()) / num_blocks;
+             vcl_size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
+ 
+             block_indices_[i] = std::pair<vcl_size_t, vcl_size_t>(start_index, stop_index);
+           }
+ 
+           //initialize preconditioner:
+           //std::cout << "Start CPU precond" << std::endl;
+           init(mat);
++>>>>>>> upstream/1.5.1
            //std::cout << "End CPU precond" << std::endl;
          }
  
@@@ -158,53 -169,85 +293,135 @@@
          {
            //initialize preconditioner:
            //std::cout << "Start CPU precond" << std::endl;
++<<<<<<< HEAD
 +          init(mat);          
 +          //std::cout << "End CPU precond" << std::endl;
 +        }
 +        
 +        
 +        template <typename VectorType>
 +        void apply(VectorType & vec) const
 +        {
 +          for (std::size_t i=0; i<block_indices_.size(); ++i)
 +          {
 +            viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU_blocks[i],
 +                                                                                      LU_blocks[i].size(),
 +                                                                                      LU_blocks[i].size());
 +            detail::ilu_vector_range<VectorType>  vec_range(vec,
 +                                                            block_indices_[i].first,
 +                                                            LU_blocks[i].size());
 +            viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, vec_range);
 +          }
 +        }
 +        
 +      private:
 +        void init(MatrixType const & mat)
 +        {
 +          
 +          for (std::size_t i=0; i<block_indices_.size(); ++i)
 +          {
 +            // Step 1: Extract blocks
 +            std::size_t block_size = block_indices_[i].second - block_indices_[i].first;
 +            InternalMatrixType mat_block(block_size);
 +            detail::extract_block_matrix(mat, mat_block, block_indices_[i].first, block_indices_[i].second);
 +            
 +            
 +            // Step 2: Precondition blocks:
 +            viennacl::tools::const_sparse_matrix_adapter<ScalarType>  mat_block_adapter(mat_block, block_size, block_size);
 +            viennacl::tools::sparse_matrix_adapter<ScalarType>        LU_adapter(LU_blocks[i], block_size, block_size);
 +            viennacl::linalg::precondition(mat_block_adapter, LU_adapter, tag_);
 +          }
 +          
 +        }
 +
 +        
 +        ILUTag const & tag_;
 +        index_vector_type block_indices_;
 +        std::vector< InternalMatrixType > LU_blocks;
 +    };
 +
 +    
++=======
+           init(mat);
+           //std::cout << "End CPU precond" << std::endl;
+         }
+ 
+ 
+         template <typename VectorType>
+         void apply(VectorType & vec) const
+         {
+           for (vcl_size_t i=0; i<block_indices_.size(); ++i)
+           {
+             detail::ilu_vector_range<VectorType, ScalarType>  vec_range(vec, block_indices_[i].first, LU_blocks[i].size2());
+ 
+             unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU_blocks[i].handle1());
+             unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU_blocks[i].handle2());
+             ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(LU_blocks[i].handle());
+ 
+             viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, LU_blocks[i].size2(), unit_lower_tag());
+             viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, LU_blocks[i].size2(), upper_tag());
+ 
+           }
+         }
+ 
+       private:
+         void init(MatrixType const & A)
+         {
+           viennacl::context host_context(viennacl::MAIN_MEMORY);
+           viennacl::compressed_matrix<ScalarType> mat(host_context);
+ 
+           viennacl::copy(A, mat);
+ 
+           unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(mat.handle1());
+ 
+ #ifdef VIENNACL_WITH_OPENMP
+           #pragma omp parallel for
+ #endif
+           for (long i=0; i<static_cast<long>(block_indices_.size()); ++i)
+           {
+             // Step 1: Extract blocks
+             vcl_size_t block_size = block_indices_[i].second - block_indices_[i].first;
+             vcl_size_t block_nnz  = row_buffer[block_indices_[i].second] - row_buffer[block_indices_[i].first];
+             viennacl::compressed_matrix<ScalarType> mat_block(block_size, block_size, block_nnz, host_context);
+ 
+             detail::extract_block_matrix(mat, mat_block, block_indices_[i].first, block_indices_[i].second);
+ 
+             // Step 2: Precondition blocks:
+             viennacl::switch_memory_context(LU_blocks[i], host_context);
+             preconditioner_dispatch(mat_block, LU_blocks[i], tag_);
+           }
+ 
+         }
+ 
+         void preconditioner_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                                      viennacl::compressed_matrix<ScalarType> & LU,
+                                      viennacl::linalg::ilu0_tag)
+         {
+           LU = mat_block;
+           viennacl::linalg::precondition(LU, tag_);
+         }
+ 
+         void preconditioner_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                                      viennacl::compressed_matrix<ScalarType> & LU,
+                                      viennacl::linalg::ilut_tag)
+         {
+           std::vector< std::map<unsigned int, ScalarType> > temp(mat_block.size1());
+ 
+           viennacl::linalg::precondition(mat_block, temp, tag_);
+ 
+           viennacl::copy(temp, LU);
+         }
+ 
+         ILUTag const & tag_;
+         index_vector_type block_indices_;
+         std::vector< viennacl::compressed_matrix<ScalarType> > LU_blocks;
+     };
+ 
+ 
+ 
+ 
+ 
++>>>>>>> upstream/1.5.1
      /** @brief ILUT preconditioner class, can be supplied to solve()-routines.
      *
      *  Specialization for compressed_matrix
@@@ -213,106 -256,201 +430,301 @@@
      class block_ilu_precond< compressed_matrix<ScalarType, MAT_ALIGNMENT>, ILUTag >
      {
          typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>        MatrixType;
++<<<<<<< HEAD
 +        typedef std::vector< std::map<unsigned int, ScalarType> >   InternalMatrixType;
 +        typedef std::vector<ScalarType>                             STLVectorType;
 +      
 +      public:
 +        typedef std::vector<std::pair<std::size_t, std::size_t> >    index_vector_type;   //the pair refers to the index range [a, b) of each block
 +          
 +        
 +        
 +        block_ilu_precond(MatrixType const & mat,
 +                          ILUTag const & tag,
 +                          std::size_t num_blocks = 4
 +                         ) : tag_(tag), LU_blocks(num_blocks)
 +        {
 +          
 +          // Set up vector of block indices:
 +          block_indices_.resize(num_blocks);
 +          for (std::size_t i=0; i<num_blocks; ++i)
 +          {
 +            std::size_t start_index = (   i  * mat.size1()) / num_blocks;
 +            std::size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
 +            
 +            block_indices_[i] = std::pair<std::size_t, std::size_t>(start_index, stop_index);
 +          }
 +          
 +          //initialize preconditioner:
 +          //std::cout << "Start CPU precond" << std::endl;
 +          init(mat);          
++=======
+         //typedef std::vector<ScalarType>                             STLVectorType;
+ 
+       public:
+         typedef std::vector<std::pair<vcl_size_t, vcl_size_t> >    index_vector_type;   //the pair refers to the index range [a, b) of each block
+ 
+ 
+         block_ilu_precond(MatrixType const & mat,
+                           ILUTag const & tag,
+                           vcl_size_t num_blocks = 8
+                          ) : tag_(tag),
+                              block_indices_(num_blocks),
+                              gpu_block_indices(),
+                              gpu_L_trans(0,0, viennacl::traits::context(mat)),
+                              gpu_U_trans(0,0, viennacl::traits::context(mat)),
+                              gpu_D(mat.size1(), viennacl::traits::context(mat)),
+                              LU_blocks(num_blocks)
+         {
+           // Set up vector of block indices:
+           block_indices_.resize(num_blocks);
+           for (vcl_size_t i=0; i<num_blocks; ++i)
+           {
+             vcl_size_t start_index = (   i  * mat.size1()) / num_blocks;
+             vcl_size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
+ 
+             block_indices_[i] = std::pair<vcl_size_t, vcl_size_t>(start_index, stop_index);
+           }
+ 
+           //initialize preconditioner:
+           //std::cout << "Start CPU precond" << std::endl;
+           init(mat);
++>>>>>>> upstream/1.5.1
            //std::cout << "End CPU precond" << std::endl;
          }
  
          block_ilu_precond(MatrixType const & mat,
                            ILUTag const & tag,
                            index_vector_type const & block_boundaries
++<<<<<<< HEAD
 +                         ) : tag_(tag), block_indices_(block_boundaries), LU_blocks(block_boundaries.size())
 +        {
 +          //initialize preconditioner:
 +          //std::cout << "Start CPU precond" << std::endl;
 +          init(mat);          
 +          //std::cout << "End CPU precond" << std::endl;
 +        }
 +        
 +        
 +        void apply(vector<ScalarType> & vec) const
 +        {
 +          viennacl::copy(vec, temp_vec);
 +          
 +          for (std::size_t i=0; i<block_indices_.size(); ++i)
 +          {
 +            viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU_blocks[i],
 +                                                                                      LU_blocks[i].size(),
 +                                                                                      LU_blocks[i].size());
 +            detail::ilu_vector_range<STLVectorType>  vec_range(temp_vec,
 +                                                            block_indices_[i].first,
 +                                                            LU_blocks[i].size());
 +            viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, vec_range);
 +          }
 +                    
 +          viennacl::copy(temp_vec, vec);
 +        }
 +        
 +      private:
 +        void init(MatrixType const & mat)
 +        {
 +          InternalMatrixType temp(mat.size1());
 +          //std::vector< std::map<unsigned int, ScalarType> > LU_cpu(mat.size1());
 +
 +          //copy to cpu:
 +          viennacl::copy(mat, temp);
 +          
 +          for (std::size_t i=0; i<block_indices_.size(); ++i)
 +          {
 +            // Step 1: Extract blocks
 +            std::size_t block_size = block_indices_[i].second - block_indices_[i].first;
 +            InternalMatrixType mat_block(block_size);
 +            viennacl::tools::const_sparse_matrix_adapter<ScalarType>  temp_adapter(temp, temp.size(), temp.size());
 +            detail::extract_block_matrix(temp_adapter, mat_block, block_indices_[i].first, block_indices_[i].second);
 +            
 +            
 +            // Step 2: Precondition blocks:
 +            viennacl::tools::const_sparse_matrix_adapter<ScalarType>  mat_block_adapter(mat_block, block_size, block_size);
 +            viennacl::tools::sparse_matrix_adapter<ScalarType>        LU_adapter(LU_blocks[i], block_size, block_size);
 +            viennacl::linalg::precondition(mat_block_adapter, LU_adapter, tag_);
 +          }
 +          
 +          //viennacl::tools::const_sparse_matrix_adapter<ScalarType>       temp_adapter(temp, temp.size(), temp.size());
 +          //viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU, LU.size(), LU.size());
 +          //viennacl::linalg::precondition(temp_adapter, LU_adapter, _tag);
 +          
 +          temp_vec.resize(mat.size1());
 +          
 +          //copy resulting preconditioner back to gpu:
 +          //copy(LU_cpu, LU);
 +        }
 +        
 +        ILUTag const & tag_;
 +        index_vector_type block_indices_;
 +        std::vector< InternalMatrixType > LU_blocks;
 +        mutable STLVectorType temp_vec;
 +    };
 +
++=======
+                          ) : tag_(tag),
+                              block_indices_(block_boundaries),
+                              gpu_block_indices(viennacl::traits::context(mat)),
+                              gpu_L_trans(0,0,viennacl::traits::context(mat)),
+                              gpu_U_trans(0,0,viennacl::traits::context(mat)),
+                              gpu_D(0,viennacl::traits::context(mat)),
+                              LU_blocks(block_boundaries.size())
+         {
+           //initialize preconditioner:
+           //std::cout << "Start CPU precond" << std::endl;
+           init(mat);
+           //std::cout << "End CPU precond" << std::endl;
+         }
+ 
+ 
+         void apply(vector<ScalarType> & vec) const
+         {
+           viennacl::linalg::detail::block_inplace_solve(trans(gpu_L_trans), gpu_block_indices, block_indices_.size(), gpu_D,
+                                                         vec,
+                                                         viennacl::linalg::unit_lower_tag());
+ 
+           viennacl::linalg::detail::block_inplace_solve(trans(gpu_U_trans), gpu_block_indices, block_indices_.size(), gpu_D,
+                                                         vec,
+                                                         viennacl::linalg::upper_tag());
+ 
+           //apply_cpu(vec);
+         }
+ 
+ 
+       private:
+ 
+         void init(MatrixType const & A)
+         {
+           viennacl::context host_context(viennacl::MAIN_MEMORY);
+           viennacl::compressed_matrix<ScalarType> mat(host_context);
+ 
+           mat = A;
+ 
+           unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(mat.handle1());
+ 
+ #ifdef VIENNACL_WITH_OPENMP
+           #pragma omp parallel for
+ #endif
+           for (long i=0; i<static_cast<long>(block_indices_.size()); ++i)
+           {
+             // Step 1: Extract blocks
+             vcl_size_t block_size = block_indices_[i].second - block_indices_[i].first;
+             vcl_size_t block_nnz  = row_buffer[block_indices_[i].second] - row_buffer[block_indices_[i].first];
+             viennacl::compressed_matrix<ScalarType> mat_block(block_size, block_size, block_nnz, host_context);
+ 
+             detail::extract_block_matrix(mat, mat_block, block_indices_[i].first, block_indices_[i].second);
+ 
+             // Step 2: Precondition blocks:
+             viennacl::switch_memory_context(LU_blocks[i], host_context);
+             preconditioner_dispatch(mat_block, LU_blocks[i], tag_);
+           }
+ 
+           /*
+            * copy resulting preconditioner back to GPU:
+            */
+ 
+           viennacl::switch_memory_context(gpu_L_trans, viennacl::traits::context(A));
+           viennacl::switch_memory_context(gpu_U_trans, viennacl::traits::context(A));
+           viennacl::switch_memory_context(gpu_D, viennacl::traits::context(A));
+ 
+           viennacl::backend::typesafe_host_array<unsigned int> block_indices_uint(gpu_block_indices, 2 * block_indices_.size());
+           for (vcl_size_t i=0; i<block_indices_.size(); ++i)
+           {
+             block_indices_uint.set(2*i, block_indices_[i].first);
+             block_indices_uint.set(2*i + 1, block_indices_[i].second);
+           }
+ 
+           viennacl::backend::memory_create(gpu_block_indices, block_indices_uint.raw_size(), viennacl::traits::context(A), block_indices_uint.get());
+ 
+           blocks_to_device(mat.size1());
+ 
+         }
+ 
+         // Copy computed preconditioned blocks to OpenCL device
+         void blocks_to_device(vcl_size_t matrix_size)
+         {
+           std::vector< std::map<unsigned int, ScalarType> > L_transposed(matrix_size);
+           std::vector< std::map<unsigned int, ScalarType> > U_transposed(matrix_size);
+           std::vector<ScalarType> entries_D(matrix_size);
+ 
+           //
+           // Transpose individual blocks into a single large matrix:
+           //
+           for (vcl_size_t block_index = 0; block_index < LU_blocks.size(); ++block_index)
+           {
+             MatrixType const & current_block = LU_blocks[block_index];
+ 
+             unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(current_block.handle1());
+             unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(current_block.handle2());
+             ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(current_block.handle());
+ 
+             vcl_size_t block_start = block_indices_[block_index].first;
+ 
+             //transpose L and U:
+             for (vcl_size_t row = 0; row < current_block.size1(); ++row)
+             {
+               unsigned int buffer_col_start = row_buffer[row];
+               unsigned int buffer_col_end   = row_buffer[row+1];
+ 
+               for (unsigned int buf_index = buffer_col_start; buf_index < buffer_col_end; ++buf_index)
+               {
+                 unsigned int col = col_buffer[buf_index];
+ 
+                 if (row > col) //entry for L
+                   L_transposed[col + block_start][static_cast<unsigned int>(row + block_start)] = elements[buf_index];
+                 else if (row == col)
+                   entries_D[row + block_start] = elements[buf_index];
+                 else //entry for U
+                   U_transposed[col + block_start][static_cast<unsigned int>(row + block_start)] = elements[buf_index];
+               }
+             }
+           }
+ 
+           //
+           // Move data to GPU:
+           //
+           tools::const_sparse_matrix_adapter<ScalarType, unsigned int> adapted_L_transposed(L_transposed, matrix_size, matrix_size);
+           tools::const_sparse_matrix_adapter<ScalarType, unsigned int> adapted_U_transposed(U_transposed, matrix_size, matrix_size);
+           viennacl::copy(adapted_L_transposed, gpu_L_trans);
+           viennacl::copy(adapted_U_transposed, gpu_U_trans);
+           viennacl::copy(entries_D, gpu_D);
+         }
+ 
+         void preconditioner_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                                      viennacl::compressed_matrix<ScalarType> & LU,
+                                      viennacl::linalg::ilu0_tag)
+         {
+           LU = mat_block;
+           viennacl::linalg::precondition(LU, tag_);
+         }
+ 
+         void preconditioner_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                                      viennacl::compressed_matrix<ScalarType> & LU,
+                                      viennacl::linalg::ilut_tag)
+         {
+           std::vector< std::map<unsigned int, ScalarType> > temp(mat_block.size1());
+ 
+           viennacl::linalg::precondition(mat_block, temp, tag_);
+ 
+           viennacl::copy(temp, LU);
+         }
+ 
+ 
+         ILUTag const & tag_;
+         index_vector_type block_indices_;
+         viennacl::backend::mem_handle gpu_block_indices;
+         viennacl::compressed_matrix<ScalarType> gpu_L_trans;
+         viennacl::compressed_matrix<ScalarType> gpu_U_trans;
+         viennacl::vector<ScalarType> gpu_D;
+ 
+         std::vector< MatrixType > LU_blocks;
+     };
+ 
+ 
++>>>>>>> upstream/1.5.1
    }
  }
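
In the upstream specialization above, the per-block factors are merged into a
single transposed L, a transposed U and a diagonal vector kept in device
memory, so apply() reduces to two block_inplace_solve() calls without a host
round trip. A hedged usage sketch; the solve() overload taking a
preconditioner and the default-constructed ilu0_tag are assumed to behave as
in the ViennaCL tutorials:

    // Hedged sketch: block-ILU0-preconditioned CG using the class above.
    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/cg.hpp"
    #include "viennacl/linalg/ilu.hpp"

    viennacl::vector<double> solve_block_ilu0(viennacl::compressed_matrix<double> const & A,
                                              viennacl::vector<double> const & rhs)
    {
      typedef viennacl::compressed_matrix<double>  MatrixType;

      // 8 diagonal blocks (the new upstream default), ILU0 within each block.
      // The tag is a named object because block_ilu_precond stores
      // 'ILUTag const & tag_', so the tag should outlive the preconditioner.
      viennacl::linalg::ilu0_tag  ilu0_config;
      viennacl::linalg::block_ilu_precond<MatrixType, viennacl::linalg::ilu0_tag>
          precond(A, ilu0_config, 8);

      return viennacl::linalg::solve(A, rhs, viennacl::linalg::cg_tag(), precond);
    }
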
  
diff --cc viennacl/linalg/detail/ilu/common.hpp
index c323638,e66e362..0665325
--- a/viennacl/linalg/detail/ilu/common.hpp
+++ b/viennacl/linalg/detail/ilu/common.hpp
@@@ -2,16 -2,17 +2,27 @@@
  #define VIENNACL_LINALG_DETAIL_ILU_COMMON_HPP_
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
@@@ -23,10 -24,16 +34,23 @@@
  
  #include <vector>
  #include <cmath>
++<<<<<<< HEAD
 +#include "viennacl/forwards.h"
 +#include "viennacl/tools/tools.hpp"
 +
 +#include <map>
++=======
+ #include <iostream>
+ #include <map>
+ #include <list>
+ 
+ #include "viennacl/forwards.h"
+ #include "viennacl/tools/tools.hpp"
+ #include "viennacl/backend/memory.hpp"
+ 
+ #include "viennacl/linalg/host_based/common.hpp"
+ #include "viennacl/linalg/misc_operations.hpp"
++>>>>>>> upstream/1.5.1
  
  namespace viennacl
  {
@@@ -34,107 -41,215 +58,319 @@@
    {
      namespace detail
      {
++<<<<<<< HEAD
 +    
 +      /** @brief Increments a row iterator (iteration along increasing row indices) up to a certain row index k.
 +      * 
 +      * Generic implementation using the iterator concept from boost::numeric::ublas. Could not find a better way for sparse matrices...
 +      *
 +      * @param row_iter   The row iterator
 +      * @param k      The final row index
 +      */
 +      template <typename T>
 +      void ilu_inc_row_iterator_to_row_index(T & row_iter, unsigned int k)
 +      {
 +        while (row_iter.index1() < k)
 +          ++row_iter;
 +      }
 +      
 +      /** @brief Increments a row iterator (iteration along increasing row indices) up to a certain row index k.
 +      * 
 +      * Specialization for the sparse matrix adapter shipped with ViennaCL
 +      *
 +      * @param row_iter   The row iterator
 +      * @param k      The final row index
 +      */
 +      template <typename ScalarType>
 +      void ilu_inc_row_iterator_to_row_index(viennacl::tools::sparse_matrix_adapter<ScalarType> & row_iter, unsigned int k)
 +      {
 +        row_iter += k - row_iter.index1();
 +      }
 +      
 +      /** @brief Increments a row iterator (iteration along increasing row indices) up to a certain row index k.
 +      * 
 +      * Specialization for the const sparse matrix adapter shipped with ViennaCL
 +      *
 +      * @param row_iter   The row iterator
 +      * @param k      The final row index
 +      */
 +      template <typename ScalarType>
 +      void ilu_inc_row_iterator_to_row_index(viennacl::tools::const_sparse_matrix_adapter<ScalarType> & row_iter, unsigned int k)
 +      {
 +        row_iter += k - row_iter.index1();
 +      }
 +
 +      /** @brief Generic inplace solution of a unit lower triangular system
 +      *   
 +      * @param mat  The system matrix
 +      * @param vec  The right hand side vector
 +      */
 +      template<typename MatrixType, typename VectorType>
 +      void ilu_inplace_solve(MatrixType const & mat, VectorType & vec, viennacl::linalg::unit_lower_tag)
 +      {
 +        typedef typename MatrixType::const_iterator1    InputRowIterator;  //iterate along increasing row index
 +        typedef typename MatrixType::const_iterator2    InputColIterator;  //iterate along increasing column index
 +        
 +        for (InputRowIterator row_iter = mat.begin1(); row_iter != mat.end1(); ++row_iter)
 +        {
 +          for (InputColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
 +          {
 +            if (col_iter.index2() < col_iter.index1())
 +              vec[col_iter.index1()] -= *col_iter * vec[col_iter.index2()];
 +          }
 +        }
 +      }
 +
 +      /** @brief Generic inplace solution of an upper triangular system
 +      *   
 +      * @param mat  The system matrix
 +      * @param vec  The right hand side vector
 +      */
 +      template<typename MatrixType, typename VectorType>
 +      void ilu_inplace_solve(MatrixType const & mat, VectorType & vec, viennacl::linalg::upper_tag)
 +      {
 +        typedef typename MatrixType::const_reverse_iterator1    InputRowIterator;  //iterate along decreasing row index
 +        typedef typename MatrixType::const_iterator2            InputColIterator;  //iterate along increasing column index
 +        typedef typename VectorType::value_type                 ScalarType;
 +        
 +        ScalarType diagonal_entry = 1.0;
 +        
 +        for (InputRowIterator row_iter = mat.rbegin1(); row_iter != mat.rend1(); ++row_iter)
 +        {
 +          for (InputColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
 +          {
 +            if (col_iter.index2() > col_iter.index1())
 +              vec[col_iter.index1()] -= *col_iter * vec[col_iter.index2()];
 +            if (col_iter.index2() == col_iter.index1())
 +              diagonal_entry = *col_iter;
 +          }
 +          vec[row_iter.index1()] /= diagonal_entry;
 +        }
 +      }
 +
 +      /** @brief Generic LU substitution
 +      *   
 +      * @param mat  The system matrix
 +      * @param vec  The right hand side vector
 +      */
 +      template<typename MatrixType, typename VectorType>
 +      void ilu_lu_substitute(MatrixType const & mat, VectorType & vec)
 +      {
 +        ilu_inplace_solve(mat, vec, unit_lower_tag());
 +        ilu_inplace_solve(mat, vec, upper_tag());
 +      }
 +
++=======
+ 
+ 
+       //
+       // Level Scheduling Setup for ILU:
+       //
+ 
+       template <typename ScalarType, unsigned int ALIGNMENT>
+       void level_scheduling_setup_impl(viennacl::compressed_matrix<ScalarType, ALIGNMENT> const & LU,
+                                        vector<ScalarType> const & diagonal_LU,
+                                        std::list< viennacl::backend::mem_handle > & row_index_arrays,
+                                        std::list< viennacl::backend::mem_handle > & row_buffers,
+                                        std::list< viennacl::backend::mem_handle > & col_buffers,
+                                        std::list< viennacl::backend::mem_handle > & element_buffers,
+                                        std::list< vcl_size_t > & row_elimination_num_list,
+                                        bool setup_U)
+       {
+         ScalarType   const * diagonal_buf = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(diagonal_LU.handle());
+         ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(LU.handle());
+         unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle1());
+         unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle2());
+ 
+         //
+         // Step 1: Determine the elimination order of each row and build up meta information about the number of entries taking part in each elimination step:
+         //
+         std::vector<vcl_size_t> row_elimination(LU.size1());
+         std::map<vcl_size_t, std::map<vcl_size_t, vcl_size_t> > row_entries_per_elimination_step;
+ 
+         vcl_size_t max_elimination_runs = 0;
+         for (vcl_size_t row2 = 0; row2 < LU.size1(); ++row2)
+         {
+           vcl_size_t row = setup_U ? (LU.size1() - row2) - 1 : row2;
+ 
+           vcl_size_t row_begin = row_buffer[row];
+           vcl_size_t row_end   = row_buffer[row+1];
+           vcl_size_t elimination_index = 0;  //Note: first run corresponds to elimination_index = 1 (otherwise, type issues with int <-> unsigned int would arise)
+           for (vcl_size_t i = row_begin; i < row_end; ++i)
+           {
+             unsigned int col = col_buffer[i];
+             if ( (!setup_U && col < row) || (setup_U && col > row) )
+             {
+               elimination_index = std::max<vcl_size_t>(elimination_index, row_elimination[col]);
+               row_entries_per_elimination_step[row_elimination[col]][row] += 1;
+             }
+           }
+           row_elimination[row] = elimination_index + 1;
+           max_elimination_runs = std::max<vcl_size_t>(max_elimination_runs, elimination_index + 1);
+         }
+ 
+         //std::cout << "Number of elimination runs: " << max_elimination_runs << std::endl;
+ 
+         //
+         // Step 2: Build row-major elimination matrix for each elimination step
+         //
+ 
+         //std::cout << "Elimination order: " << std::endl;
+         //for (vcl_size_t i=0; i<row_elimination.size(); ++i)
+         //  std::cout << row_elimination[i] << ", ";
+         //std::cout << std::endl;
+ 
+         //vcl_size_t summed_rows = 0;
+         for (vcl_size_t elimination_run = 1; elimination_run <= max_elimination_runs; ++elimination_run)
+         {
+           std::map<vcl_size_t, vcl_size_t> const & current_elimination_info = row_entries_per_elimination_step[elimination_run];
+ 
+           // count cols and entries handled in this elimination step
+           vcl_size_t num_tainted_cols = current_elimination_info.size();
+           vcl_size_t num_entries = 0;
+ 
+           for (std::map<vcl_size_t, vcl_size_t>::const_iterator it  = current_elimination_info.begin();
+                                                                   it != current_elimination_info.end();
+                                                                 ++it)
+             num_entries += it->second;
+ 
+           //std::cout << "num_entries: " << num_entries << std::endl;
+           //std::cout << "num_tainted_cols: " << num_tainted_cols << std::endl;
+ 
+           if (num_tainted_cols > 0)
+           {
+             row_index_arrays.push_back(viennacl::backend::mem_handle());
+             viennacl::backend::switch_memory_context<unsigned int>(row_index_arrays.back(), viennacl::traits::context(LU));
+             viennacl::backend::typesafe_host_array<unsigned int> elim_row_index_array(row_index_arrays.back(), num_tainted_cols);
+ 
+             row_buffers.push_back(viennacl::backend::mem_handle());
+             viennacl::backend::switch_memory_context<unsigned int>(row_buffers.back(), viennacl::traits::context(LU));
+             viennacl::backend::typesafe_host_array<unsigned int> elim_row_buffer(row_buffers.back(), num_tainted_cols + 1);
+ 
+             col_buffers.push_back(viennacl::backend::mem_handle());
+             viennacl::backend::switch_memory_context<unsigned int>(col_buffers.back(), viennacl::traits::context(LU));
+             viennacl::backend::typesafe_host_array<unsigned int> elim_col_buffer(col_buffers.back(), num_entries);
+ 
+             element_buffers.push_back(viennacl::backend::mem_handle());
+             viennacl::backend::switch_memory_context<ScalarType>(element_buffers.back(), viennacl::traits::context(LU));
+             std::vector<ScalarType> elim_elements_buffer(num_entries);
+ 
+             row_elimination_num_list.push_back(num_tainted_cols);
+ 
+             vcl_size_t k=0;
+             vcl_size_t nnz_index = 0;
+             elim_row_buffer.set(0, 0);
+ 
+             for (std::map<vcl_size_t, vcl_size_t>::const_iterator it  = current_elimination_info.begin();
+                                                                     it != current_elimination_info.end();
+                                                                   ++it)
+             {
+               //vcl_size_t col = setup_U ? (elimination_matrix.size() - it->first) - 1 : col2;
+               vcl_size_t row = it->first;
+               elim_row_index_array.set(k, row);
+ 
+               vcl_size_t row_begin = row_buffer[row];
+               vcl_size_t row_end   = row_buffer[row+1];
+               for (vcl_size_t i = row_begin; i < row_end; ++i)
+               {
+                 unsigned int col = col_buffer[i];
+                 if ( (!setup_U && col < row) || (setup_U && col > row) ) //entry of L/U
+                 {
+                   if (row_elimination[col] == elimination_run) // this entry is substituted in this run
+                   {
+                     elim_col_buffer.set(nnz_index, col);
+                     elim_elements_buffer[nnz_index] = setup_U ? elements[i] / diagonal_buf[it->first] : elements[i];
+                     ++nnz_index;
+                   }
+                 }
+               }
+ 
+               elim_row_buffer.set(++k, nnz_index);
+             }
+ 
+             //
+             // Wrap in memory_handles:
+             //
+             viennacl::backend::memory_create(row_index_arrays.back(), elim_row_index_array.raw_size(),                  viennacl::traits::context(row_index_arrays.back()), elim_row_index_array.get());
+             viennacl::backend::memory_create(row_buffers.back(),      elim_row_buffer.raw_size(),                       viennacl::traits::context(row_buffers.back()),      elim_row_buffer.get());
+             viennacl::backend::memory_create(col_buffers.back(),      elim_col_buffer.raw_size(),                       viennacl::traits::context(col_buffers.back()),      elim_col_buffer.get());
+             viennacl::backend::memory_create(element_buffers.back(),  sizeof(ScalarType) * elim_elements_buffer.size(), viennacl::traits::context(element_buffers.back()),  &(elim_elements_buffer[0]));
+           }
+ 
+           // Print some info:
+           //std::cout << "Eliminated columns in run " << elimination_run << ": " << num_tainted_cols << " (tainted columns: " << num_tainted_cols << ")" << std::endl;
+           //summed_rows += eliminated_rows_in_run;
+           //if (eliminated_rows_in_run == 0)
+           //  break;
+         }
+         //std::cout << "Eliminated rows: " << summed_rows << " out of " << row_elimination.size() << std::endl;
+       }
+ 
+ 
+       template <typename ScalarType, unsigned int ALIGNMENT>
+       void level_scheduling_setup_L(viennacl::compressed_matrix<ScalarType, ALIGNMENT> const & LU,
+                                 vector<ScalarType> const & diagonal_LU,
+                                 std::list< viennacl::backend::mem_handle > & row_index_arrays,
+                                 std::list< viennacl::backend::mem_handle > & row_buffers,
+                                 std::list< viennacl::backend::mem_handle > & col_buffers,
+                                 std::list< viennacl::backend::mem_handle > & element_buffers,
+                                 std::list< vcl_size_t > & row_elimination_num_list)
+       {
+         level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, false);
+       }
+ 
+ 
+       //
+       // Multifrontal setup of U:
+       //
+ 
+       template <typename ScalarType, unsigned int ALIGNMENT>
+       void level_scheduling_setup_U(viennacl::compressed_matrix<ScalarType, ALIGNMENT> const & LU,
+                                 vector<ScalarType> const & diagonal_LU,
+                                 std::list< viennacl::backend::mem_handle > & row_index_arrays,
+                                 std::list< viennacl::backend::mem_handle > & row_buffers,
+                                 std::list< viennacl::backend::mem_handle > & col_buffers,
+                                 std::list< viennacl::backend::mem_handle > & element_buffers,
+                                 std::list< vcl_size_t > & row_elimination_num_list)
+       {
+         level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, true);
+       }
+ 
+ 
+       //
+       // Multifrontal substitution (both L and U). Will partly be moved to single_threaded/opencl/cuda implementations
+       //
+       template <typename ScalarType>
+       void level_scheduling_substitute(vector<ScalarType> & vec,
+                                        std::list< viennacl::backend::mem_handle > const & row_index_arrays,
+                                        std::list< viennacl::backend::mem_handle > const & row_buffers,
+                                        std::list< viennacl::backend::mem_handle > const & col_buffers,
+                                        std::list< viennacl::backend::mem_handle > const & element_buffers,
+                                        std::list< vcl_size_t > const & row_elimination_num_list)
+       {
+         typedef typename std::list< viennacl::backend::mem_handle >::const_iterator  ListIterator;
+         ListIterator row_index_array_it = row_index_arrays.begin();
+         ListIterator row_buffers_it = row_buffers.begin();
+         ListIterator col_buffers_it = col_buffers.begin();
+         ListIterator element_buffers_it = element_buffers.begin();
+         typename std::list< vcl_size_t>::const_iterator row_elimination_num_it = row_elimination_num_list.begin();
+         for (vcl_size_t i=0; i<row_index_arrays.size(); ++i)
+         {
+           viennacl::linalg::detail::level_scheduling_substitute(vec, *row_index_array_it, *row_buffers_it, *col_buffers_it, *element_buffers_it, *row_elimination_num_it);
+ 
+           ++row_index_array_it;
+           ++row_buffers_it;
+           ++col_buffers_it;
+           ++element_buffers_it;
+           ++row_elimination_num_it;
+         }
+       }
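+ 
+       // A minimal call-sequence sketch (all names here are placeholders, not
+       // library API): once the per-level handles have been produced by
+       //   detail::level_scheduling_setup_L(LU, diag, idx, rows, cols, elems, nums);
+       // a full unit-lower solve is the single call
+       //   detail::level_scheduling_substitute(vec, idx, rows, cols, elems, nums);
+       // which performs one substitution step per level stored in the lists.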
+ 
+ 
+ 
+ 
+ 
++>>>>>>> upstream/1.5.1
      } // namespace detail
    } // namespace linalg
  } // namespace viennacl
diff --cc viennacl/linalg/detail/ilu/ilu0.hpp
index b1ba7ae,d9b11ed..0bc923f
--- a/viennacl/linalg/detail/ilu/ilu0.hpp
+++ b/viennacl/linalg/detail/ilu/ilu0.hpp
@@@ -3,39 -3,49 +3,82 @@@
  #define VIENNACL_LINALG_DETAIL_ILU0_HPP_
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2011, Institute for Microelectronics,
 +   Institute for Analysis and Scientific Computing,
 +   TU Wien.
 +
 +   -----------------
 +   ViennaCL - The Vienna Computing Library
 +   -----------------
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
++>>>>>>> upstream/1.5.1
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
  
     (A list of authors and contributors can be found in the PDF manual)
  
++<<<<<<< HEAD
 +License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file viennacl/linalg/detail/ilu/ilu0.hpp
 +  @brief Implementations of incomplete factorization preconditioners with static nonzero pattern. Contributed by Evan Bollig.
 +
 +  ILU0 (Incomplete LU with zero fill-in) 
 +  - All preconditioner nonzeros exist at locations that were nonzero in the input matrix. 
 +  - The number of nonzeros in the output preconditioner is exactly the same as in the input matrix
 +
 + Evan Bollig 3/30/12
 + 
 + Adapted from viennacl/linalg/detail/ilut.hpp
 +
++=======
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file viennacl/linalg/detail/ilu/ilu0.hpp
+   @brief Implementations of incomplete factorization preconditioners with static nonzero pattern.
+ 
+   Contributed by Evan Bollig.
+ 
+   ILU0 (Incomplete LU with zero fill-in)
+   - All preconditioner nonzeros exist at locations that were nonzero in the input matrix.
+   - The number of nonzeros in the output preconditioner is exactly the same as in the input matrix
+ 
+  Evan Bollig 3/30/12
+ 
+  Adapted from viennacl/linalg/detail/ilut.hpp
+ 
+  Low-level reimplementation by Karl Rupp in Nov 2012, increasing performance substantially. Also added level-scheduling.
+ 
++>>>>>>> upstream/1.5.1
  */
  
  #include <vector>
  #include <cmath>
++<<<<<<< HEAD
++#include "viennacl/forwards.h"
++#include "viennacl/tools/tools.hpp"
++#include "viennacl/linalg/detail/ilu/common.hpp"
++=======
+ #include <iostream>
  #include "viennacl/forwards.h"
  #include "viennacl/tools/tools.hpp"
  #include "viennacl/linalg/detail/ilu/common.hpp"
+ #include "viennacl/compressed_matrix.hpp"
+ #include "viennacl/backend/memory.hpp"
+ 
+ #include "viennacl/linalg/host_based/common.hpp"
++>>>>>>> upstream/1.5.1
  
  #include <map>
  
@@@ -44,121 -54,94 +87,211 @@@ namespace viennac
    namespace linalg
    {
  
++<<<<<<< HEAD
 +    /** @brief A tag for incomplete LU factorization with static pattern (ILU0)
++=======
+     /** @brief A tag for incomplete LU factorization with static pattern (ILU0)
++>>>>>>> upstream/1.5.1
      */
      class ilu0_tag
      {
        public:
++<<<<<<< HEAD
 +        /** @brief The constructor.
 +          *
 +          * @param row_start     The starting row for the block to which we apply ILU
 +          * @param row_end       The end row of the block to which we apply ILU
 +          */
 +        ilu0_tag(unsigned int row_start = 0, unsigned int row_end = -1)
 +            : _row_start(row_start),  
 +            _row_end(row_end) {}
 +              
 +      public: 
 +        unsigned int _row_start, _row_end;
 +    };
 +
 +
 +    /** @brief Implementation of a ILU-preconditioner with static pattern
 +      *
 +      * refer to the Algorithm in Saad's book (1996 edition)
 +      *
 +      *  @param input   The input matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns
 +      *  @param output  The output matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns and write access via operator()
 +      *  @param tag     An ilu0_tag in order to dispatch among several other preconditioners.
 +      */
 +    template<typename MatrixType, typename LUType>
 +    void precondition(MatrixType const & input, LUType & output, ilu0_tag const & tag)
 +    {
 +      typedef std::map<unsigned int, double>          SparseVector;
 +      typedef typename SparseVector::iterator         SparseVectorIterator;
 +      typedef typename MatrixType::const_iterator1    InputRowIterator;  //iterate along increasing row index
 +      typedef typename MatrixType::const_iterator2    InputColIterator;  //iterate along increasing column index
 +      typedef typename LUType::iterator1              OutputRowIterator;  //iterate along increasing row index
 +      typedef typename LUType::iterator2              OutputColIterator;  //iterate along increasing column index
 +
 +      output.clear();
 +      assert(input.size1() == output.size1());
 +      assert(input.size2() == output.size2());
 +      output.resize(static_cast<unsigned int>(input.size1()), static_cast<unsigned int>(input.size2()), false);
 +      SparseVector w;
 +
 +
 +      std::map<double, unsigned int> temp_map;
 +
 +      // For i = 2, ... , N, DO
 +      for (InputRowIterator row_iter = input.begin1(); row_iter != input.end1(); ++row_iter)
 +      {
 +        w.clear();
 +        for (InputColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
 +        {
 +          // Only work on the block described by (row_start:row_end, row_start:row_end)
 +          if ((static_cast<unsigned int>(row_iter.index1()) >= tag._row_start) && (static_cast<unsigned int>(row_iter.index1()) < tag._row_end))
 +          {
 +              if ((static_cast<unsigned int>(col_iter.index2()) >= tag._row_start) && (static_cast<unsigned int>(col_iter.index2()) < tag._row_end))
 +              {
 +                  w[static_cast<unsigned int>(col_iter.index2())] = *col_iter;
 +              }
 +          } 
 +          else 
 +          {
 +              // Put identity on the excluded diagonal
 +              w[static_cast<unsigned int>(row_iter.index1())] = 1.; 
 +          }
 +        }
 +
 +        //line 3:
 +        OutputRowIterator row_iter_out = output.begin1();
 +        for (SparseVectorIterator k = w.begin(); k != w.end(); ++k)
 +        {
 +          unsigned int index_k = k->first;
 +          // Only process the strictly lower part (k < i); stop at the diagonal
 +          if (index_k >= static_cast<unsigned int>(row_iter.index1()))
 +              break;
 +
 +          detail::ilu_inc_row_iterator_to_row_index(row_iter_out, index_k);
 +
 +          //line 3: temp = a_ik = a_ik / a_kk
 +          double temp = k->second / output(index_k, index_k);
 +          if (output(index_k, index_k) == 0.0)
 +          {
 +              std::cerr << "ViennaCL: FATAL ERROR in ILU0(): Diagonal entry is zero in row " << index_k << "!" << std::endl;
 +
 +          }
 +
 +          for (OutputColIterator j = row_iter_out.begin(); j != row_iter_out.end(); ++j)
 +          {
 +              // Only fill if it is a nonzero element of the input matrix
 +              if (input(row_iter.index1(), j.index2())) {
 +                  // Follow standard ILU algorithm (i.e., for j = k+1, ... , N)
 +                  if (j.index2() > index_k) 
 +                  {
 +                      // set a_ij
 +                      w[j.index2()] -= temp * *j;
 +                  }
 +              }
 +          }
 +          // Set a_ik
 +          w[index_k] = temp;
 +          
 +        } //for k
 +
 +        // Write rows back to LU factor output
 +        unsigned int k_count = 0; 
 +        for (SparseVectorIterator k = w.begin(); k != w.end(); ++k )
 +        {
 +          output(static_cast<unsigned int>(row_iter.index1()), k->first) = static_cast<typename LUType::value_type>(w[k->first]);
 +          k_count ++; 
 +        }
 +      } //for i
 +    }
 +
 +
 +    /** @brief ILU0 preconditioner class, can be supplied to solve()-routines
++=======
+         ilu0_tag(bool with_level_scheduling = false) : use_level_scheduling_(with_level_scheduling) {}
+ 
+         bool use_level_scheduling() const { return use_level_scheduling_; }
+         void use_level_scheduling(bool b) { use_level_scheduling_ = b; }
+ 
+       private:
+         bool use_level_scheduling_;
+     };
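+ 
+     // A minimal usage sketch (illustrative only; `tag` is a placeholder name):
+     //
+     //   viennacl::linalg::ilu0_tag tag(true);   // enable level scheduling
+     //   tag.use_level_scheduling(false);        // or toggle it after construction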
+ 
+ 
+     /** @brief Implementation of a ILU-preconditioner with static pattern. Optimized version for CSR matrices.
+       *
+       * refer to the Algorithm in Saad's book (1996 edition)
+       *
+       *  @param A       The sparse matrix. The result is directly written to A.
+       */
+     template<typename ScalarType>
+     void precondition(viennacl::compressed_matrix<ScalarType> & A, ilu0_tag const & /* tag */)
+     {
+       assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+       assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+       assert( (A.handle().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+ 
+       ScalarType         * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(A.handle());
+       unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+       unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+ 
+       // Note: Line numbers in the following refer to the algorithm in Saad's book
+ 
+       for (vcl_size_t i=1; i<A.size1(); ++i)  // Line 1
+       {
+         unsigned int row_i_begin = row_buffer[i];
+         unsigned int row_i_end   = row_buffer[i+1];
+         for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k) //Note: We do not assume that the column indices within a row are sorted
+         {
+           unsigned int k = col_buffer[buf_index_k];
+           if (k >= i)
+             continue; //Note: We do not assume that the column indices within a row are sorted
+ 
+           unsigned int row_k_begin = row_buffer[k];
+           unsigned int row_k_end   = row_buffer[k+1];
+ 
+           // get a_kk:
+           ScalarType a_kk = 0;
+           for (unsigned int buf_index_akk = row_k_begin; buf_index_akk < row_k_end; ++buf_index_akk)
+           {
+             if (col_buffer[buf_index_akk] == k)
+             {
+               a_kk = elements[buf_index_akk];
+               break;
+             }
+           }
+ 
+           ScalarType & a_ik = elements[buf_index_k];
+           a_ik /= a_kk;                                 //Line 3
+ 
+           for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j) //Note: We do not assume that the column indices within a row are sorted
+           {
+             unsigned int j = col_buffer[buf_index_j];
+             if (j <= k)
+               continue;
+ 
+             // determine a_kj:
+             ScalarType a_kj = 0;
+             for (unsigned int buf_index_akj = row_k_begin; buf_index_akj < row_k_end; ++buf_index_akj)
+             {
+               if (col_buffer[buf_index_akj] == j)
+               {
+                 a_kj = elements[buf_index_akj];
+                 break;
+               }
+             }
+ 
+             //a_ij -= a_ik * a_kj
+             elements[buf_index_j] -= a_ik * a_kj;  //Line 5
+           }
+         }
+       }
+ 
+     }
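+ 
+     // A minimal usage sketch (`n` and the entries of `A` are placeholders): the
+     // factorization overwrites A with the fused L/U factors, so A must reside in
+     // main memory first.
+     //
+     //   viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+     //   viennacl::compressed_matrix<double> A(n, n);
+     //   viennacl::switch_memory_context(A, host_ctx);
+     //   /* ... fill A ... */
+     //   viennacl::linalg::precondition(A, viennacl::linalg::ilu0_tag());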
+ 
+ 
+     /** @brief ILU0 preconditioner class, can be supplied to solve()-routines
++>>>>>>> upstream/1.5.1
      */
      template <typename MatrixType>
      class ilu0_precond
@@@ -166,35 -149,42 +299,72 @@@
          typedef typename MatrixType::value_type      ScalarType;
  
        public:
++<<<<<<< HEAD
 +        ilu0_precond(MatrixType const & mat, ilu0_tag const & tag) : _tag(tag), LU(mat.size1())
 +        {
 +            //initialize preconditioner:
 +            //std::cout << "Start CPU precond" << std::endl;
 +            init(mat);          
++=======
+         ilu0_precond(MatrixType const & mat, ilu0_tag const & tag) : tag_(tag), LU()
+         {
+             //initialize preconditioner:
+             //std::cout << "Start CPU precond" << std::endl;
+             init(mat);
++>>>>>>> upstream/1.5.1
              //std::cout << "End CPU precond" << std::endl;
          }
  
          template <typename VectorType>
++<<<<<<< HEAD
 +            void apply(VectorType & vec) const
 +            {
 +                viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU);
 +                viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, vec);
 +            }
++=======
+         void apply(VectorType & vec) const
+         {
+           unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle1());
+           unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle2());
+           ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(LU.handle());
+ 
+           viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LU.size2(), unit_lower_tag());
+           viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LU.size2(), upper_tag());
+         }
++>>>>>>> upstream/1.5.1
  
        private:
          void init(MatrixType const & mat)
          {
++<<<<<<< HEAD
 +            viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU);
 +            viennacl::linalg::precondition(mat, LU_adapter, _tag);
 +        }
 +
 +        ilu0_tag const & _tag;
 +        
 +        public: std::vector< std::map<unsigned int, ScalarType> > LU;
 +    };
 +
 +
 +    /** @brief ILU0 preconditioner class, can be supplied to solve()-routines.
++=======
+           viennacl::context host_context(viennacl::MAIN_MEMORY);
+           viennacl::switch_memory_context(LU, host_context);
+ 
+           viennacl::copy(mat, LU);
+           viennacl::linalg::precondition(LU, tag_);
+         }
+ 
+         ilu0_tag const & tag_;
+ 
+         viennacl::compressed_matrix<ScalarType> LU;
+     };
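+ 
+     // A minimal usage sketch (assumes viennacl/linalg/cg.hpp is included;
+     // `MatrixT`, `VectorT`, `A`, `b` are placeholders for an assembled system):
+     //
+     //   viennacl::linalg::ilu0_precond<MatrixT> precond(A, viennacl::linalg::ilu0_tag());
+     //   VectorT x = viennacl::linalg::solve(A, b, viennacl::linalg::cg_tag(), precond);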
+ 
+ 
+     /** @brief ILU0 preconditioner class, can be supplied to solve()-routines.
++>>>>>>> upstream/1.5.1
        *
        *  Specialization for compressed_matrix
        */
@@@ -203,46 -193,180 +373,222 @@@
      {
          typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>   MatrixType;
  
++<<<<<<< HEAD
 +        public:
 +        ilu0_precond(MatrixType const & mat, ilu0_tag const & tag) : _tag(tag), LU(mat.size1())
 +        {
 +            //initialize preconditioner:
 +            //std::cout << "Start GPU precond" << std::endl;
 +            init(mat);          
 +            //std::cout << "End GPU precond" << std::endl;
++=======
+       public:
+         ilu0_precond(MatrixType const & mat, ilu0_tag const & tag) : tag_(tag), LU(mat.size1(), mat.size2())
+         {
+           //initialize preconditioner:
+           //std::cout << "Start GPU precond" << std::endl;
+           init(mat);
+           //std::cout << "End GPU precond" << std::endl;
++>>>>>>> upstream/1.5.1
          }
  
          void apply(vector<ScalarType> & vec) const
          {
++<<<<<<< HEAD
 +            copy(vec, temp_vec);
 +            //lu_substitute(LU, vec);
 +            viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU);
 +            viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, temp_vec);
 +
 +            copy(temp_vec, vec);
 +        }
 +
 +        private:
 +        void init(MatrixType const & mat)
 +        {
 +            std::vector< std::map<unsigned int, ScalarType> > temp(mat.size1());
 +            //std::vector< std::map<unsigned int, ScalarType> > LU_cpu(mat.size1());
 +
 +            //copy to cpu:
 +            copy(mat, temp);
 +
 +            viennacl::tools::const_sparse_matrix_adapter<ScalarType>       temp_adapter(temp);
 +            viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU);
 +            viennacl::linalg::precondition(temp_adapter, LU_adapter, _tag);
 +
 +            temp_vec.resize(mat.size1());
 +
 +        }
 +
 +        ilu0_tag const & _tag;
 +        //MatrixType LU;
 +        public: std::vector< std::map<unsigned int, ScalarType> > LU;
 +        private: mutable std::vector<ScalarType> temp_vec;
++=======
+           viennacl::context host_context(viennacl::MAIN_MEMORY);
+           if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
+           {
+             if (tag_.use_level_scheduling())
+             {
+               //std::cout << "Using multifrontal on GPU..." << std::endl;
+               detail::level_scheduling_substitute(vec,
+                                                   multifrontal_L_row_index_arrays_,
+                                                   multifrontal_L_row_buffers_,
+                                                   multifrontal_L_col_buffers_,
+                                                   multifrontal_L_element_buffers_,
+                                                   multifrontal_L_row_elimination_num_list_);
+ 
+               vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+ 
+               detail::level_scheduling_substitute(vec,
+                                                   multifrontal_U_row_index_arrays_,
+                                                   multifrontal_U_row_buffers_,
+                                                   multifrontal_U_col_buffers_,
+                                                   multifrontal_U_element_buffers_,
+                                                   multifrontal_U_row_elimination_num_list_);
+             }
+             else
+             {
+               viennacl::context old_context = viennacl::traits::context(vec);
+               viennacl::switch_memory_context(vec, host_context);
+               viennacl::linalg::inplace_solve(LU, vec, unit_lower_tag());
+               viennacl::linalg::inplace_solve(LU, vec, upper_tag());
+               viennacl::switch_memory_context(vec, old_context);
+             }
+           }
+           else //apply ILU0 directly on CPU
+           {
+             if (tag_.use_level_scheduling())
+             {
+               //std::cout << "Using multifrontal..." << std::endl;
+               detail::level_scheduling_substitute(vec,
+                                                   multifrontal_L_row_index_arrays_,
+                                                   multifrontal_L_row_buffers_,
+                                                   multifrontal_L_col_buffers_,
+                                                   multifrontal_L_element_buffers_,
+                                                   multifrontal_L_row_elimination_num_list_);
+ 
+               vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+ 
+               detail::level_scheduling_substitute(vec,
+                                                   multifrontal_U_row_index_arrays_,
+                                                   multifrontal_U_row_buffers_,
+                                                   multifrontal_U_col_buffers_,
+                                                   multifrontal_U_element_buffers_,
+                                                   multifrontal_U_row_elimination_num_list_);
+             }
+             else
+             {
+               viennacl::linalg::inplace_solve(LU, vec, unit_lower_tag());
+               viennacl::linalg::inplace_solve(LU, vec, upper_tag());
+             }
+           }
+         }
+ 
+         vcl_size_t levels() const { return multifrontal_L_row_index_arrays_.size(); }
+ 
+       private:
+         void init(MatrixType const & mat)
+         {
+           viennacl::context host_context(viennacl::MAIN_MEMORY);
+           viennacl::switch_memory_context(LU, host_context);
+           LU = mat;
+           viennacl::linalg::precondition(LU, tag_);
+ 
+           if (!tag_.use_level_scheduling())
+             return;
+ 
+           // multifrontal part:
+           viennacl::switch_memory_context(multifrontal_U_diagonal_, host_context);
+           multifrontal_U_diagonal_.resize(LU.size1(), false);
+           host_based::detail::row_info(LU, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+ 
+           detail::level_scheduling_setup_L(LU,
+                                            multifrontal_U_diagonal_, //dummy
+                                            multifrontal_L_row_index_arrays_,
+                                            multifrontal_L_row_buffers_,
+                                            multifrontal_L_col_buffers_,
+                                            multifrontal_L_element_buffers_,
+                                            multifrontal_L_row_elimination_num_list_);
+ 
+ 
+           detail::level_scheduling_setup_U(LU,
+                                            multifrontal_U_diagonal_,
+                                            multifrontal_U_row_index_arrays_,
+                                            multifrontal_U_row_buffers_,
+                                            multifrontal_U_col_buffers_,
+                                            multifrontal_U_element_buffers_,
+                                            multifrontal_U_row_elimination_num_list_);
+ 
+           //
+           // Bring to device if necessary:
+           //
+ 
+           // L:
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_index_arrays_.begin();
+                                                                              it != multifrontal_L_row_index_arrays_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_buffers_.begin();
+                                                                              it != multifrontal_L_row_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_col_buffers_.begin();
+                                                                              it != multifrontal_L_col_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_element_buffers_.begin();
+                                                                              it != multifrontal_L_element_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<ScalarType>(*it, viennacl::traits::context(mat));
+ 
+ 
+           // U:
+ 
+           viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_index_arrays_.begin();
+                                                                              it != multifrontal_U_row_index_arrays_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_buffers_.begin();
+                                                                              it != multifrontal_U_row_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_col_buffers_.begin();
+                                                                              it != multifrontal_U_col_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_element_buffers_.begin();
+                                                                              it != multifrontal_U_element_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<ScalarType>(*it, viennacl::traits::context(mat));
+ 
+         }
+ 
+         ilu0_tag const & tag_;
+         viennacl::compressed_matrix<ScalarType> LU;
+ 
+         std::list< viennacl::backend::mem_handle > multifrontal_L_row_index_arrays_;
+         std::list< viennacl::backend::mem_handle > multifrontal_L_row_buffers_;
+         std::list< viennacl::backend::mem_handle > multifrontal_L_col_buffers_;
+         std::list< viennacl::backend::mem_handle > multifrontal_L_element_buffers_;
+         std::list< vcl_size_t > multifrontal_L_row_elimination_num_list_;
+ 
+         viennacl::vector<ScalarType> multifrontal_U_diagonal_;
+         std::list< viennacl::backend::mem_handle > multifrontal_U_row_index_arrays_;
+         std::list< viennacl::backend::mem_handle > multifrontal_U_row_buffers_;
+         std::list< viennacl::backend::mem_handle > multifrontal_U_col_buffers_;
+         std::list< viennacl::backend::mem_handle > multifrontal_U_element_buffers_;
+         std::list< vcl_size_t > multifrontal_U_row_elimination_num_list_;
+ 
++>>>>>>> upstream/1.5.1
      };
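  
      // A minimal level-scheduling sketch (illustrative; `A` and `rhs` are placeholders):
      //
      //   viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<double> >
      //       precond(A, viennacl::linalg::ilu0_tag(true));  // multifrontal setup in init()
      //   std::cout << "levels: " << precond.levels() << std::endl;
      //   precond.apply(rhs);  // level-by-level forward/backward substitution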
  
    }
diff --cc viennacl/linalg/detail/ilu/ilut.hpp
index 61e2175,311f0c1..e291583
--- a/viennacl/linalg/detail/ilu/ilut.hpp
+++ b/viennacl/linalg/detail/ilu/ilut.hpp
@@@ -2,16 -2,17 +2,27 @@@
  #define VIENNACL_LINALG_DETAIL_ILUT_HPP_
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
@@@ -23,10 -24,14 +34,20 @@@
  
  #include <vector>
  #include <cmath>
++<<<<<<< HEAD
++=======
+ #include <iostream>
++>>>>>>> upstream/1.5.1
  #include "viennacl/forwards.h"
  #include "viennacl/tools/tools.hpp"
  
  #include "viennacl/linalg/detail/ilu/common.hpp"
++<<<<<<< HEAD
++=======
+ #include "viennacl/compressed_matrix.hpp"
+ 
+ #include "viennacl/linalg/host_based/common.hpp"
++>>>>>>> upstream/1.5.1
  
  #include <map>
  
@@@ -34,7 -39,7 +55,11 @@@ namespace viennac
  {
    namespace linalg
    {
++<<<<<<< HEAD
 +    
++=======
+ 
++>>>>>>> upstream/1.5.1
      /** @brief A tag for incomplete LU factorization with threshold (ILUT)
      */
      class ilut_tag
@@@ -42,154 -47,198 +67,347 @@@
        public:
          /** @brief The constructor.
          *
++<<<<<<< HEAD
 +        * @param entries_per_row  Number of nonzero entries per row in L and U. Note that L and U are stored in a single matrix, thus there are 2*entries_per_row in total.
 +        * @param drop_tolerance   The drop tolerance for ILUT
 +        */
 +        ilut_tag(unsigned int entries_per_row = 20,
 +                 double drop_tolerance = 1e-4) : _entries_per_row(entries_per_row), _drop_tolerance(drop_tolerance) {}
++=======
+         * @param entries_per_row        Number of nonzero entries per row in L and U. Note that L and U are stored in a single matrix, thus there are 2*entries_per_row in total.
+         * @param drop_tolerance         The drop tolerance for ILUT
+         * @param with_level_scheduling  Flag for enabling level scheduling on GPUs.
+         */
+         ilut_tag(unsigned int entries_per_row = 20,
+                  double drop_tolerance = 1e-4,
+                  bool with_level_scheduling = false) : entries_per_row_(entries_per_row), drop_tolerance_(drop_tolerance), use_level_scheduling_(with_level_scheduling) {}
++>>>>>>> upstream/1.5.1
  
          void set_drop_tolerance(double tol)
          {
            if (tol > 0)
++<<<<<<< HEAD
 +            _drop_tolerance = tol;
 +        }
 +        double get_drop_tolerance() const { return _drop_tolerance; }
 +        
 +        void set_entries_per_row(unsigned int e)
 +        {
 +          if (e > 0)
 +            _entries_per_row = e;
 +        }
 +
 +        unsigned int get_entries_per_row() const { return _entries_per_row; }
 +
 +      private:
 +        unsigned int _entries_per_row;
 +        double _drop_tolerance;
 +    };
 +    
 +        
 +    /** @brief Implementation of a ILU-preconditioner with threshold
 +    *
 +    * Refer to Algorithm 10.6 in Saad's book (1996 edition)
 +    *
 +    *  @param input   The input matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns
 +    *  @param output  The output matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns and write access via operator()
 +    *  @param tag     An ilut_tag in order to dispatch among several other preconditioners.
 +    */
 +    template<typename MatrixType, typename LUType>
 +    void precondition(MatrixType const & input, LUType & output, ilut_tag const & tag)
 +    {
 +      typedef std::map<unsigned int, double>          SparseVector;
 +      typedef typename SparseVector::iterator         SparseVectorIterator;
 +      typedef typename MatrixType::const_iterator1    InputRowIterator;  //iterate along increasing row index
 +      typedef typename MatrixType::const_iterator2    InputColIterator;  //iterate along increasing column index
 +      typedef typename LUType::iterator1              OutputRowIterator;  //iterate along increasing row index
 +      typedef typename LUType::iterator2              OutputColIterator;  //iterate along increasing column index
 +
 +      output.clear();
 +      assert(input.size1() == output.size1());
 +      assert(input.size2() == output.size2());
 +      output.resize(static_cast<unsigned int>(input.size1()), static_cast<unsigned int>(input.size2()), false);
 +      SparseVector w;
 +      
 +      std::map<double, unsigned int> temp_map;
 +      
 +      for (InputRowIterator row_iter = input.begin1(); row_iter != input.end1(); ++row_iter)
 +      {
 +    /*    if (i%10 == 0)
 +      std::cout << i << std::endl;*/
 +        
 +        //line 2:
 +        w.clear();
 +        for (InputColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
 +          w[static_cast<unsigned int>(col_iter.index2())] = *col_iter;
 +
 +        //line 3:
 +        OutputRowIterator row_iter_out = output.begin1();
 +        for (SparseVectorIterator w_k = w.begin(); w_k != w.end(); ++w_k)
 +        {
 +          unsigned int k = w_k->first;
 +          if (k >= static_cast<unsigned int>(row_iter.index1()))
 +            break;
 +          
 +          
 +          //while (row_iter_out.index1() < index_k)
 +          //  ++row_iter_out;
 +          //if (row_iter_out.index1() < index_k)
 +          //  row_iter_out += index_k - row_iter_out.index1();
 +          detail::ilu_inc_row_iterator_to_row_index(row_iter_out, k);
 +          
 +          //line 4:
 +          double a_kk = output(k, k);
 +          double temp = w_k->second / a_kk;
 +          if (a_kk == 0.0)
 +          {
 +            std::cerr << "ViennaCL: FATAL ERROR in ILUT(): Diagonal entry is zero in row " << k 
 +                      << " while processing line " << row_iter.index1() << "!" << std::endl;
 +          }
 +          
 +          //line 5: (dropping rule to w_k)
 +          if ( fabs(temp) > tag.get_drop_tolerance())
 +          {
 +            //line 7:
 +            for (OutputColIterator u_k = row_iter_out.begin(); u_k != row_iter_out.end(); ++u_k)
 +            {
 +              if (u_k.index2() >= k)
 +                w[u_k.index2()] -= temp * *u_k;
 +            }
 +          }
 +        } //for k
 +        
 +        //Line 10: Apply a dropping rule to w
 +        //Sort entries which are kept
 +        temp_map.clear();
 +        for (SparseVectorIterator w_k = w.begin(); w_k != w.end(); )
 +        {
 +          if ( (fabs(w_k->second) < tag.get_drop_tolerance()) 
 +               && (w_k->first != static_cast<unsigned int>(row_iter.index1())) //do not drop diagonal element!
 +             )
 +          { 
 +            long index = w_k->first;
 +            ++w_k;
 +            w.erase(index);
 +          }
 +          else
 +          {
 +            double temp = fabs(w_k->second);
 +            while (temp_map.find(temp) != temp_map.end())
 +              temp *= 1.00000001; //make entry slightly larger to maintain uniqueness of the entry
 +            temp_map[temp] = w_k->first;
 +            ++w_k;
++=======
+             drop_tolerance_ = tol;
+         }
+         double get_drop_tolerance() const { return drop_tolerance_; }
+ 
+         void set_entries_per_row(unsigned int e)
+         {
+           if (e > 0)
+             entries_per_row_ = e;
+         }
+ 
+         unsigned int get_entries_per_row() const { return entries_per_row_; }
+ 
+         bool use_level_scheduling() const { return use_level_scheduling_; }
+         void use_level_scheduling(bool b) { use_level_scheduling_ = b; }
+ 
+       private:
+         unsigned int entries_per_row_;
+         double drop_tolerance_;
+         bool use_level_scheduling_;
+     };
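+ 
+     // A minimal usage sketch (values are arbitrary examples): keep at most 30
+     // entries per row in each factor, drop relative to a tolerance of 1e-5, and
+     // enable level scheduling:
+     //
+     //   viennacl::linalg::ilut_tag tag(30, 1e-5, true);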
+ 
+ 
+     /** @brief Dispatcher overload for extracting the row of nonzeros of a compressed matrix */
+     template <typename ScalarType, typename SizeType, typename SparseVector>
+     ScalarType setup_w(viennacl::compressed_matrix<ScalarType> const & A,
+                        SizeType row,
+                        SparseVector & w)
+     {
+       assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILUT") );
+       assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILUT") );
+       assert( (A.handle().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILUT") );
+ 
+       ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(A.handle());
+       unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+       unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+ 
+       SizeType row_i_begin = static_cast<SizeType>(row_buffer[row]);
+       SizeType row_i_end   = static_cast<SizeType>(row_buffer[row+1]);
+       ScalarType row_norm = 0;
+       for (SizeType buf_index_i = row_i_begin; buf_index_i < row_i_end; ++buf_index_i) //Note: We do not assume that the column indices within a row are sorted
+       {
+         ScalarType entry = elements[buf_index_i];
+         w[col_buffer[buf_index_i]] = entry;
+         row_norm += entry * entry;
+       }
+       return std::sqrt(row_norm);
+     }
+ 
+     /** @brief Dispatcher overload for extracting the row of nonzeros of a STL-grown sparse matrix */
+     template <typename ScalarType, typename SizeType, typename SparseVector>
+     ScalarType setup_w(std::vector< std::map<SizeType, ScalarType> > const & A,
+                        SizeType row,
+                        SparseVector & w)
+     {
+       ScalarType row_norm = 0;
+       w = A[row];
+       for (typename std::map<SizeType, ScalarType>::const_iterator iter_w  = w.begin(); iter_w != w.end(); ++iter_w)
+         row_norm += iter_w->second * iter_w->second;
+ 
+       return std::sqrt(row_norm);
+     }
+ 
+ 
+     /** @brief Implementation of a ILU-preconditioner with threshold. Optimized implementation for compressed_matrix.
+     *
+     * Refer to Algorithm 10.6 in Saad's book (1996 edition)
+     *
+     *  @param A       The input matrix. Either a compressed_matrix or of type std::vector< std::map<T, U> >
+     *  @param output  The output matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns and write access via operator()
+     *  @param tag     An ilut_tag in order to dispatch among several other preconditioners.
+     */
+     template<typename SparseMatrixType, typename ScalarType, typename SizeType>
+     void precondition(SparseMatrixType const & A,
+                       std::vector< std::map<SizeType, ScalarType> > & output,
+                       ilut_tag const & tag)
+     {
+       typedef std::map<SizeType, ScalarType>          SparseVector;
+       typedef typename SparseVector::iterator         SparseVectorIterator;
+       typedef typename std::map<SizeType, ScalarType>::const_iterator   OutputRowConstIterator;
+       typedef std::multimap<ScalarType, std::pair<SizeType, ScalarType> >  TemporarySortMap;
+ 
+       assert(viennacl::traits::size1(A) == output.size() && bool("Output matrix size mismatch") );
+ 
+       SparseVector w;
+       TemporarySortMap temp_map;
+ 
+       for (SizeType i=0; i<viennacl::traits::size1(A); ++i)  // Line 1
+       {
+     /*    if (i%10 == 0)
+       std::cout << i << std::endl;*/
+ 
+         //line 2: set up w
+         ScalarType row_norm = setup_w(A, i, w);
+         ScalarType tau_i = static_cast<ScalarType>(tag.get_drop_tolerance()) * row_norm;
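+         // Worked example of the relative dropping rule: with drop_tolerance = 1e-4
+         // and row_norm = 50, tau_i = 5e-3, so entries with magnitude below 5e-3 are
+         // dropped (except the diagonal entry, which is always kept).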
+ 
+         //line 3:
+         for (SparseVectorIterator w_k = w.begin(); w_k != w.end(); ++w_k)
+         {
+           SizeType k = w_k->first;
+           if (k >= i)
+             break;
+ 
+           //line 4:
+           ScalarType a_kk = output[k][k];
+           if (a_kk == 0)
+           {
+             std::cerr << "ViennaCL: FATAL ERROR in ILUT(): Diagonal entry is zero in row " << k
+                       << " while processing line " << i << "!" << std::endl;
+             throw "ILUT zero diagonal!";
+           }
+ 
+           ScalarType w_k_entry = w_k->second / a_kk;
+           w_k->second = w_k_entry;
+ 
+           //line 5: (dropping rule to w_k)
+           if ( std::fabs(w_k_entry) > tau_i)
+           {
+             //line 7:
+             for (OutputRowConstIterator u_k = output[k].begin(); u_k != output[k].end(); ++u_k)
+             {
+               if (u_k->first > k)
+                 w[u_k->first] -= w_k_entry * u_k->second;
+             }
+           }
+           //else
+           //  w.erase(k);
+ 
+         } //for w_k
+ 
+         //Line 10: Apply a dropping rule to w
+         //Sort entries which are kept
+         temp_map.clear();
+         for (SparseVectorIterator w_k = w.begin(); w_k != w.end(); ++w_k)
+         {
+           SizeType k = w_k->first;
+           ScalarType w_k_entry = w_k->second;
+ 
+           ScalarType abs_w_k = std::fabs(w_k_entry);
+           if ( (abs_w_k > tau_i) || (k == i) )//do not drop diagonal element!
+           {
+ 
+             if (abs_w_k == 0) // this can only happen for diagonal entry
+               throw "Triangular factor in ILUT singular!";
+ 
+             temp_map.insert(std::make_pair(abs_w_k, std::make_pair(k, w_k_entry)));
++>>>>>>> upstream/1.5.1
            }
          }
  
          //Lines 10-12: write the largest p values to L and U
++<<<<<<< HEAD
 +        unsigned int written_L = 0;
 +        unsigned int written_U = 0;
 +        for (typename std::map<double, unsigned int>::reverse_iterator iter = temp_map.rbegin(); iter != temp_map.rend(); ++iter)
 +        {
 +          if (iter->second > static_cast<unsigned int>(row_iter.index1())) //entry for U
 +          {
 +            if (written_U < tag.get_entries_per_row())
 +            {
 +              output(static_cast<unsigned int>(row_iter.index1()), iter->second) = static_cast<typename LUType::value_type>(w[iter->second]);
 +              ++written_U;
 +            }
 +          }
 +          else if (iter->second == static_cast<unsigned int>(row_iter.index1()))
 +          {
 +            output(iter->second, iter->second) = static_cast<typename LUType::value_type>(w[static_cast<unsigned int>(row_iter.index1())]);
 +          }
 +          else //entry for L
 +          {
 +            if (written_L < tag.get_entries_per_row())
 +            {
 +              output(static_cast<unsigned int>(row_iter.index1()), iter->second) = static_cast<typename LUType::value_type>(w[iter->second]);
 +              ++written_L;
 +            }
 +          }
 +        }
++=======
+         SizeType written_L = 0;
+         SizeType written_U = 0;
+         for (typename TemporarySortMap::reverse_iterator iter = temp_map.rbegin(); iter != temp_map.rend(); ++iter)
+         {
+           std::map<SizeType, ScalarType> & row_i = output[i];
+           SizeType j = (iter->second).first;
+           ScalarType w_j_entry = (iter->second).second;
+ 
+           if (j < i) // Line 11: entry for L
+           {
+             if (written_L < tag.get_entries_per_row())
+             {
+               row_i[j] = w_j_entry;
+               ++written_L;
+             }
+           }
+           else if (j == i)  // Diagonal entry is always kept
+           {
+             row_i[j] = w_j_entry;
+           }
+           else //Line 12: entry for U
+           {
+             if (written_U < tag.get_entries_per_row())
+             {
+               row_i[j] = w_j_entry;
+               ++written_U;
+             }
+           }
+         }
+ 
+         w.clear(); //Line 13
+ 
++>>>>>>> upstream/1.5.1
        } //for i
      }
  
@@@ -200,35 -249,50 +418,82 @@@
      class ilut_precond
      {
        typedef typename MatrixType::value_type      ScalarType;
++<<<<<<< HEAD
 +      
 +      public:
 +        ilut_precond(MatrixType const & mat, ilut_tag const & tag) : _tag(tag), LU(mat.size1())
 +        {
 +          //initialize preconditioner:
 +          //std::cout << "Start CPU precond" << std::endl;
 +          init(mat);          
 +          //std::cout << "End CPU precond" << std::endl;
 +        }
 +        
 +        template <typename VectorType>
 +        void apply(VectorType & vec) const
 +        {
 +          viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU, LU.size(), LU.size());
 +          viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, vec);
 +        }
 +        
 +      private:
 +        void init(MatrixType const & mat)
 +        {
 +          viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU, LU.size(), LU.size());
 +          viennacl::linalg::precondition(mat, LU_adapter, _tag);
 +        }
 +        
 +        ilut_tag const & _tag;
 +        std::vector< std::map<unsigned int, ScalarType> > LU;
 +    };
 +
 +    
++=======
+ 
+       public:
+         ilut_precond(MatrixType const & mat, ilut_tag const & tag) : tag_(tag), LU(mat.size1(), mat.size2())
+         {
+           //initialize preconditioner:
+           //std::cout << "Start CPU precond" << std::endl;
+           init(mat);
+           //std::cout << "End CPU precond" << std::endl;
+         }
+ 
+         template <typename VectorType>
+         void apply(VectorType & vec) const
+         {
+           //Note: Since vec can be a rather arbitrary vector type, we call the more generic version in the backend manually:
+           unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle1());
+           unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle2());
+           ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(LU.handle());
+ 
+           viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LU.size2(), unit_lower_tag());
+           viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LU.size2(), upper_tag());
+         }
+ 
+       private:
+         void init(MatrixType const & mat)
+         {
+           viennacl::context host_context(viennacl::MAIN_MEMORY);
+           viennacl::compressed_matrix<ScalarType> temp;
+           viennacl::switch_memory_context(temp, host_context);
+ 
+           viennacl::copy(mat, temp);
+ 
+           std::vector< std::map<unsigned int, ScalarType> > LU_temp(mat.size1());
+ 
+           viennacl::linalg::precondition(temp, LU_temp, tag_);
+ 
+           viennacl::switch_memory_context(LU, host_context);
+           viennacl::copy(LU_temp, LU);
+         }
+ 
+         ilut_tag const & tag_;
+         viennacl::compressed_matrix<ScalarType> LU;
+     };
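+ 
+     // A minimal usage sketch (assumes viennacl/linalg/bicgstab.hpp is included;
+     // `MatrixT`, `VectorT`, `A`, `b` are placeholders for an assembled system):
+     //
+     //   viennacl::linalg::ilut_precond<MatrixT> precond(A, viennacl::linalg::ilut_tag(20, 1e-4));
+     //   VectorT x = viennacl::linalg::solve(A, b, viennacl::linalg::bicgstab_tag(), precond);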
+ 
+ 
++>>>>>>> upstream/1.5.1
      /** @brief ILUT preconditioner class, can be supplied to solve()-routines.
      *
      *  Specialization for compressed_matrix
@@@ -237,49 -301,177 +502,223 @@@
      class ilut_precond< compressed_matrix<ScalarType, MAT_ALIGNMENT> >
      {
        typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>   MatrixType;
++<<<<<<< HEAD
 +      
 +      public:
 +        ilut_precond(MatrixType const & mat, ilut_tag const & tag) : _tag(tag), LU(mat.size1())
 +        {
 +          //initialize preconditioner:
 +          //std::cout << "Start GPU precond" << std::endl;
 +          init(mat);          
 +          //std::cout << "End GPU precond" << std::endl;
 +        }
 +        
 +        void apply(vector<ScalarType> & vec) const
 +        {
 +          copy(vec, temp_vec);
 +          //lu_substitute(LU, vec);
 +          viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU, LU.size(), LU.size());
 +          viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, temp_vec);
 +          
 +          copy(temp_vec, vec);
 +        }
 +        
 +      private:
 +        void init(MatrixType const & mat)
 +        {
 +          std::vector< std::map<unsigned int, ScalarType> > temp(mat.size1());
 +          //std::vector< std::map<unsigned int, ScalarType> > LU_cpu(mat.size1());
 +
 +          //copy to cpu:
 +          copy(mat, temp);
 +          
 +          viennacl::tools::const_sparse_matrix_adapter<ScalarType>       temp_adapter(temp, temp.size(), temp.size());
 +          viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU, LU.size(), LU.size());
 +          viennacl::linalg::precondition(temp_adapter, LU_adapter, _tag);
 +          
 +          temp_vec.resize(mat.size1());
 +          
 +          //copy resulting preconditioner back to gpu:
 +          //copy(LU_cpu, LU);
 +        }
 +        
 +        ilut_tag const & _tag;
 +        //MatrixType LU;
 +        std::vector< std::map<unsigned int, ScalarType> > LU;
 +        mutable std::vector<ScalarType> temp_vec;
++=======
+ 
+       public:
+         ilut_precond(MatrixType const & mat, ilut_tag const & tag) : tag_(tag), LU(mat.size1(), mat.size2())
+         {
+           //initialize preconditioner:
+           //std::cout << "Start GPU precond" << std::endl;
+           init(mat);
+           //std::cout << "End GPU precond" << std::endl;
+         }
+ 
+         void apply(vector<ScalarType> & vec) const
+         {
+           if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
+           {
+             if (tag_.use_level_scheduling())
+             {
+               //std::cout << "Using multifrontal on GPU..." << std::endl;
+               detail::level_scheduling_substitute(vec,
+                                                   multifrontal_L_row_index_arrays_,
+                                                   multifrontal_L_row_buffers_,
+                                                   multifrontal_L_col_buffers_,
+                                                   multifrontal_L_element_buffers_,
+                                                   multifrontal_L_row_elimination_num_list_);
+ 
+               vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+ 
+               detail::level_scheduling_substitute(vec,
+                                                   multifrontal_U_row_index_arrays_,
+                                                   multifrontal_U_row_buffers_,
+                                                   multifrontal_U_col_buffers_,
+                                                   multifrontal_U_element_buffers_,
+                                                   multifrontal_U_row_elimination_num_list_);
+             }
+             else
+             {
+               viennacl::context host_context(viennacl::MAIN_MEMORY);
+               viennacl::context old_context = viennacl::traits::context(vec);
+               viennacl::switch_memory_context(vec, host_context);
+               viennacl::linalg::inplace_solve(LU, vec, unit_lower_tag());
+               viennacl::linalg::inplace_solve(LU, vec, upper_tag());
+               viennacl::switch_memory_context(vec, old_context);
+             }
+           }
+           else //apply ILUT directly:
+           {
+             viennacl::linalg::inplace_solve(LU, vec, unit_lower_tag());
+             viennacl::linalg::inplace_solve(LU, vec, upper_tag());
+           }
+         }
+ 
+       private:
+         void init(MatrixType const & mat)
+         {
+           viennacl::context host_context(viennacl::MAIN_MEMORY);
+           viennacl::switch_memory_context(LU, host_context);
+ 
+           std::vector< std::map<unsigned int, ScalarType> > LU_temp(mat.size1());
+ 
+           if (viennacl::traits::context(mat).memory_type() == viennacl::MAIN_MEMORY)
+           {
+             viennacl::linalg::precondition(mat, LU_temp, tag_);
+           }
+           else //we need to copy to CPU
+           {
+             viennacl::compressed_matrix<ScalarType> cpu_mat(mat.size1(), mat.size2());
+             viennacl::switch_memory_context(cpu_mat, host_context);
+ 
+             cpu_mat = mat;
+ 
+             viennacl::linalg::precondition(cpu_mat, LU_temp, tag_);
+           }
+ 
+           viennacl::copy(LU_temp, LU);
+ 
+           if (!tag_.use_level_scheduling())
+             return;
+ 
+           //
+           // multifrontal part:
+           //
+ 
+           viennacl::switch_memory_context(multifrontal_U_diagonal_, host_context);
+           multifrontal_U_diagonal_.resize(LU.size1(), false);
+           host_based::detail::row_info(LU, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+ 
+           detail::level_scheduling_setup_L(LU,
+                                            multifrontal_U_diagonal_, //dummy
+                                            multifrontal_L_row_index_arrays_,
+                                            multifrontal_L_row_buffers_,
+                                            multifrontal_L_col_buffers_,
+                                            multifrontal_L_element_buffers_,
+                                            multifrontal_L_row_elimination_num_list_);
+ 
+ 
+           detail::level_scheduling_setup_U(LU,
+                                            multifrontal_U_diagonal_,
+                                            multifrontal_U_row_index_arrays_,
+                                            multifrontal_U_row_buffers_,
+                                            multifrontal_U_col_buffers_,
+                                            multifrontal_U_element_buffers_,
+                                            multifrontal_U_row_elimination_num_list_);
+ 
+           //
+           // Bring to device if necessary:
+           //
+ 
+           // L:
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_index_arrays_.begin();
+                                                                              it != multifrontal_L_row_index_arrays_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_buffers_.begin();
+                                                                              it != multifrontal_L_row_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_col_buffers_.begin();
+                                                                              it != multifrontal_L_col_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_element_buffers_.begin();
+                                                                              it != multifrontal_L_element_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<ScalarType>(*it, viennacl::traits::context(mat));
+ 
+ 
+           // U:
+ 
+           viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_index_arrays_.begin();
+                                                                              it != multifrontal_U_row_index_arrays_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_buffers_.begin();
+                                                                              it != multifrontal_U_row_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_col_buffers_.begin();
+                                                                              it != multifrontal_U_col_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+ 
+           for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_element_buffers_.begin();
+                                                                              it != multifrontal_U_element_buffers_.end();
+                                                                            ++it)
+             viennacl::backend::switch_memory_context<ScalarType>(*it, viennacl::traits::context(mat));
+ 
+ 
+         }
+ 
+         ilut_tag const & tag_;
+         viennacl::compressed_matrix<ScalarType> LU;
+ 
+         std::list< viennacl::backend::mem_handle > multifrontal_L_row_index_arrays_;
+         std::list< viennacl::backend::mem_handle > multifrontal_L_row_buffers_;
+         std::list< viennacl::backend::mem_handle > multifrontal_L_col_buffers_;
+         std::list< viennacl::backend::mem_handle > multifrontal_L_element_buffers_;
+         std::list< vcl_size_t > multifrontal_L_row_elimination_num_list_;
+ 
+         viennacl::vector<ScalarType> multifrontal_U_diagonal_;
+         std::list< viennacl::backend::mem_handle > multifrontal_U_row_index_arrays_;
+         std::list< viennacl::backend::mem_handle > multifrontal_U_row_buffers_;
+         std::list< viennacl::backend::mem_handle > multifrontal_U_col_buffers_;
+         std::list< viennacl::backend::mem_handle > multifrontal_U_element_buffers_;
+         std::list< vcl_size_t > multifrontal_U_row_elimination_num_list_;
++>>>>>>> upstream/1.5.1
      };
  
    }
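
For orientation, the merged ILUT code above is driven through the usual tag / preconditioner / solver triple. A minimal usage sketch, assuming the 1.5.x public API (the level-scheduling flag in the ilut_tag constructor is an assumption; only tag_.use_level_scheduling() is visible above):

    // Hedged sketch: ILUT-preconditioned GMRES with ViennaCL 1.5.x.
    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/ilu.hpp"    // convenience header, see below
    #include "viennacl/linalg/gmres.hpp"

    viennacl::vector<double> solve_with_ilut(viennacl::compressed_matrix<double> const & A,
                                             viennacl::vector<double> const & b)
    {
      // 20 entries per row, drop tolerance 1e-4; 'true' requests the
      // level-scheduled substitution path exercised in apply() above.
      viennacl::linalg::ilut_tag tag(20, 1e-4, true);
      viennacl::linalg::ilut_precond< viennacl::compressed_matrix<double> > precond(A, tag);

      // Each preconditioned residual goes through apply(); without level
      // scheduling, vec is temporarily moved to MAIN_MEMORY as shown above.
      return viennacl::linalg::solve(A, b, viennacl::linalg::gmres_tag(1e-8, 300, 20), precond);
    }
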
diff --cc viennacl/linalg/eig.hpp
index 6ba8e6f,8479f94..98ce0c0
--- a/viennacl/linalg/eig.hpp
+++ b/viennacl/linalg/eig.hpp
@@@ -2,22 -2,23 +2,37 @@@
  #define VIENNACL_LINALG_EIG_HPP_
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2011, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
  ============================================================================= */
  
++<<<<<<< HEAD
 +/** @file eig.hpp
++=======
+ /** @file viennacl/linalg/eig.hpp
++>>>>>>> upstream/1.5.1
  *   @brief Convenience header file including all available eigenvalue algorithms
  */
  
@@@ -25,4 -26,4 +40,8 @@@
  #include "viennacl/linalg/lanczos.hpp"
  #include "viennacl/linalg/power_iter.hpp"
  
- #endif
++<<<<<<< HEAD
++#endif
++=======
+ #endif
++>>>>>>> upstream/1.5.1
diff --cc viennacl/linalg/gmres.hpp
index 35a7a05,7768763..bc14c8a
--- a/viennacl/linalg/gmres.hpp
+++ b/viennacl/linalg/gmres.hpp
@@@ -132,38 -177,29 +177,42 @@@ namespace viennac
        unsigned int krylov_dim = tag.krylov_dim();
        if (problem_size < tag.krylov_dim())
          krylov_dim = problem_size; //A Krylov space larger than the matrix would lead to seg-faults (mathematically, error is certain to be zero already)
-       
-       VectorType res(problem_size);
-       VectorType v_k_tilde(problem_size);
-       VectorType v_k_tilde_temp(problem_size);
-       
-       std::vector< std::vector<CPU_ScalarType> > R(krylov_dim);
+ 
+       VectorType res = rhs;
+       VectorType v_k_tilde = rhs;
+       VectorType v_k_tilde_temp = rhs;
+ 
+       std::vector< std::vector<CPU_ScalarType> > R(krylov_dim, std::vector<CPU_ScalarType>(tag.krylov_dim()));
        std::vector<CPU_ScalarType> projection_rhs(krylov_dim);
-       std::vector<VectorType> U(krylov_dim);
  
-       const CPU_ScalarType gpu_scalar_minus_1 = static_cast<CPU_ScalarType>(-1);    //representing the scalar '-1' on the GPU. Prevents blocking write operations
-       const CPU_ScalarType gpu_scalar_1 = static_cast<CPU_ScalarType>(1);    //representing the scalar '1' on the GPU. Prevents blocking write operations
-       const CPU_ScalarType gpu_scalar_2 = static_cast<CPU_ScalarType>(2);    //representing the scalar '2' on the GPU. Prevents blocking write operations
-       
+       std::vector<VectorType>      householder_reflectors(krylov_dim, rhs);
+       std::vector<CPU_ScalarType>  betas(krylov_dim);
+ 
        CPU_ScalarType norm_rhs = viennacl::linalg::norm_2(rhs);
++<<<<<<< HEAD
 +      
 +      if (norm_rhs == 0) //solution is zero if RHS norm is zero
 +        return result;
 +      
 +      unsigned int k;
 +      for (k = 0; k < krylov_dim; ++k)
 +      {
 +        R[k].resize(tag.krylov_dim()); 
 +        viennacl::traits::resize(U[k], problem_size);
 +      }
++=======
++>>>>>>> upstream/1.5.1
+ 
+       if (norm_rhs == 0) //solution is zero if RHS norm is zero
+         return result;
  
-       //std::cout << "Starting GMRES..." << std::endl;
        tag.iters(0);
-       
+ 
        for (unsigned int it = 0; it <= tag.max_restarts(); ++it)
        {
-         //std::cout << "-- GMRES Start " << it << " -- " << std::endl;
-         
+         //
+         // (Re-)Initialize residual: r = b - A*x (without temporary for the result of A*x)
+         //
          res = rhs;
          res -= viennacl::linalg::prod(matrix, result);  //initial guess zero
          precond.apply(res);
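
The gmres.hpp hunk above clamps the Krylov dimension to the problem size and rebuilds the residual r = b - A*x at every restart. A small call-site sketch; the gmres_tag argument order (tolerance, maximum iterations, Krylov dimension) follows the 1.5.x documentation:

    // Hedged sketch: restarted GMRES as exercised by the loop above.
    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/linalg/gmres.hpp"

    viennacl::vector<double> solve_with_gmres(viennacl::compressed_matrix<double> const & A,
                                              viennacl::vector<double> const & b)
    {
      viennacl::linalg::gmres_tag tag(1e-10,  // relative residual tolerance
                                      250,    // total iteration budget
                                      50);    // Krylov dimension per restart
      viennacl::vector<double> x = viennacl::linalg::solve(A, b, tag);
      // tag.iters() afterwards reports the counter initialized via
      // tag.iters(0) above; tag.max_restarts() bounds the outer loop.
      return x;
    }
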
diff --cc viennacl/linalg/ilu.hpp
index 91e8c18,f913649..99d67c0
--- a/viennacl/linalg/ilu.hpp
+++ b/viennacl/linalg/ilu.hpp
@@@ -17,7 -18,7 +18,11 @@@
     License:         MIT (X11), see file LICENSE in the base directory
  ============================================================================= */
  
++<<<<<<< HEAD
 +/** @file ilu.hpp
++=======
+ /** @file viennacl/linalg/ilu.hpp
++>>>>>>> upstream/1.5.1
      @brief Implementations of incomplete factorization preconditioners. Convenience header file.
  */
  
diff --cc viennacl/linalg/inner_prod.hpp
index 042c594,ed810db..f1b922a
--- a/viennacl/linalg/inner_prod.hpp
+++ b/viennacl/linalg/inner_prod.hpp
@@@ -139,42 -100,74 +100,102 @@@ namespace viennac
      // ----------------------------------------------------
      // VIENNACL
      //
-     template< typename ScalarType, unsigned int alignment1, unsigned int alignment2 >
-     viennacl::scalar_expression< const viennacl::vector<ScalarType, alignment1>, 
-                                  const viennacl::vector<ScalarType, alignment2>,
+     template< typename NumericT>
+     viennacl::scalar_expression< const vector_base<NumericT>, const vector_base<NumericT>, viennacl::op_inner_prod >
+     inner_prod(vector_base<NumericT> const & vector1,
+                vector_base<NumericT> const & vector2)
+     {
+       //std::cout << "viennacl .. " << std::endl;
+       return viennacl::scalar_expression< const vector_base<NumericT>,
+                                           const vector_base<NumericT>,
+                                           viennacl::op_inner_prod >(vector1, vector2);
+     }
+ 
+ 
+     // expression on lhs:
+     template< typename LHS, typename RHS, typename OP, typename NumericT>
+     viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>,
+                                  const vector_base<NumericT>,
                                   viennacl::op_inner_prod >
-     inner_prod(viennacl::vector<ScalarType, alignment1> const & vector1, viennacl::vector<ScalarType, alignment2> const & vector2, 
-          typename viennacl::enable_if< viennacl::is_viennacl< typename viennacl::traits::tag_of< viennacl::vector<ScalarType, alignment1> >::type >::value
-                                             >::type* dummy = 0)
+     inner_prod(viennacl::vector_expression<LHS, RHS, OP> const & vector1,
+                vector_base<NumericT> const & vector2)
      {
        //std::cout << "viennacl .. " << std::endl;
-       return viennacl::linalg::inner_prod_impl(vector1, vector2);
+       return viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>,
+                                           const vector_base<NumericT>,
+                                           viennacl::op_inner_prod >(vector1, vector2);
      }
++<<<<<<< HEAD
 +    
 +    template< typename VectorType >
 +    viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
 +                                 const viennacl::vector_range<VectorType>,
 +                                 viennacl::op_inner_prod >
 +    inner_prod(viennacl::vector_range<VectorType> const & vector1,
 +               viennacl::vector_range<VectorType> const & vector2)
 +    {
 +      return viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
 +                                          const viennacl::vector_range<VectorType>,
 +                                          viennacl::op_inner_prod >(vector1, vector2);
 +    }
 +
 +    template< typename VectorType >
 +    viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
 +                                 const viennacl::vector_slice<VectorType>,
 +                                 viennacl::op_inner_prod >
 +    inner_prod(viennacl::vector_slice<VectorType> const & vector1,
 +               viennacl::vector_slice<VectorType> const & vector2)
 +    {
 +      return viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
 +                                          const viennacl::vector_slice<VectorType>,
 +                                          viennacl::op_inner_prod >(vector1, vector2);
 +    }
 +
++=======
+ 
+     // expression on rhs:
+     template <typename NumericT, typename LHS, typename RHS, typename OP>
+     viennacl::scalar_expression< const vector_base<NumericT>,
+                                  const viennacl::vector_expression<LHS, RHS, OP>,
+                                  viennacl::op_inner_prod >
+     inner_prod(vector_base<NumericT> const & vector1,
+                viennacl::vector_expression<LHS, RHS, OP> const & vector2)
+     {
+       //std::cout << "viennacl .. " << std::endl;
+       return viennacl::scalar_expression< const vector_base<NumericT>,
+                                           const viennacl::vector_expression<LHS, RHS, OP>,
+                                           viennacl::op_inner_prod >(vector1, vector2);
+     }
+ 
+     // expression on lhs and rhs:
+     template <typename LHS1, typename RHS1, typename OP1,
+               typename LHS2, typename RHS2, typename OP2>
+     viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>,
+                                  const viennacl::vector_expression<LHS2, RHS2, OP2>,
+                                  viennacl::op_inner_prod >
+     inner_prod(viennacl::vector_expression<LHS1, RHS1, OP1> const & vector1,
+                viennacl::vector_expression<LHS2, RHS2, OP2> const & vector2)
+     {
+       //std::cout << "viennacl .. " << std::endl;
+       return viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>,
+                                           const viennacl::vector_expression<LHS2, RHS2, OP2>,
+                                           viennacl::op_inner_prod >(vector1, vector2);
+     }
+ 
+ 
+     // Multiple inner products:
+     template< typename NumericT>
+     viennacl::vector_expression< const vector_base<NumericT>, const vector_tuple<NumericT>, viennacl::op_inner_prod >
+     inner_prod(vector_base<NumericT> const & x,
+                vector_tuple<NumericT> const & y_tuple)
+     {
+       return viennacl::vector_expression< const vector_base<NumericT>,
+                                           const vector_tuple<NumericT>,
+                                           viennacl::op_inner_prod >(x, y_tuple);
+     }
+ 
+ 
++>>>>>>> upstream/1.5.1
    } // end namespace linalg
  } // end namespace viennacl
  #endif
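
The rewritten inner_prod overloads above accept vector expressions on either operand, so call sites no longer need explicit temporaries. A short sketch of what the four overloads cover (double precision assumed; viennacl::tie as the tuple builder for the multiple-inner-product overload is an assumed 1.5.x helper name):

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/inner_prod.hpp"

    void inner_prod_demo()
    {
      viennacl::vector<double> x(1000), y(1000), z(1000);

      double s1 = viennacl::linalg::inner_prod(x, y);          // base, base
      double s2 = viennacl::linalg::inner_prod(x + y, z);      // expression on lhs
      double s3 = viennacl::linalg::inner_prod(x, y - z);      // expression on rhs
      double s4 = viennacl::linalg::inner_prod(x + y, y - z);  // expressions on both sides

      // Multiple inner products sharing x (vector_tuple overload above);
      // viennacl::tie is an assumed helper name:
      viennacl::vector<double> results(2);
      results = viennacl::linalg::inner_prod(x, viennacl::tie(y, z));
    }
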
diff --cc viennacl/linalg/jacobi_precond.hpp
index 4afb2d7,bc268d9..8bb5de3
--- a/viennacl/linalg/jacobi_precond.hpp
+++ b/viennacl/linalg/jacobi_precond.hpp
@@@ -154,27 -121,12 +121,31 @@@ namespace viennac
          template <unsigned int ALIGNMENT>
          void apply(viennacl::vector<ScalarType, ALIGNMENT> & vec) const
          {
++<<<<<<< HEAD
 +          assert(viennacl::traits::size1(system_matrix) == viennacl::traits::size(vec));
 +          
 +          //run kernel:
 +          viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<ScalarType, ALIGNMENT>::program_name(),
 +                                                                "diag_precond");
 +
 +          viennacl::ocl::enqueue(
 +             k(viennacl::traits::handle(diag_A_inv),
 +                cl_uint(viennacl::traits::start(diag_A_inv)),
 +                cl_uint(viennacl::traits::stride(diag_A_inv)),
 +                cl_uint(viennacl::traits::size(diag_A_inv)),
 +               viennacl::traits::handle(vec),
 +                cl_uint(viennacl::traits::start(vec)),
 +                cl_uint(viennacl::traits::stride(vec)),
 +                cl_uint(viennacl::traits::size(vec)) )
 +                                );        
++=======
+           assert(viennacl::traits::size(diag_A) == viennacl::traits::size(vec) && bool("Size mismatch"));
+           vec = element_div(vec, diag_A);
++>>>>>>> upstream/1.5.1
          }
-         
+ 
        private:
-         MatrixType const & system_matrix;
-         viennacl::vector<ScalarType> diag_A_inv;
+         viennacl::vector<ScalarType> diag_A;
      };
  
    }
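
After this merge the Jacobi preconditioner stores the diagonal of the system matrix directly and applies it with one element_div(), which works on any backend. A usage sketch with the conjugate-gradient solver (jacobi_tag taken to be default-constructible, as in the 1.5.x examples):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/linalg/jacobi_precond.hpp"
    #include "viennacl/linalg/cg.hpp"

    viennacl::vector<double> solve_with_jacobi(viennacl::compressed_matrix<double> const & A,
                                               viennacl::vector<double> const & b)
    {
      viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<double> >
          jacobi(A, viennacl::linalg::jacobi_tag());

      // Each CG iteration lands in the merged apply(), i.e.
      // vec = element_div(vec, diag_A) on vec's current backend.
      return viennacl::linalg::solve(A, b, viennacl::linalg::cg_tag(), jacobi);
    }
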
diff --cc viennacl/linalg/lanczos.hpp
index bf5560a,2785435..4379f64
--- a/viennacl/linalg/lanczos.hpp
+++ b/viennacl/linalg/lanczos.hpp
@@@ -2,16 -2,17 +2,27 @@@
  #define VIENNACL_LINALG_LANCZOS_HPP_
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
@@@ -19,11 -20,11 +30,19 @@@
  
  /** @file viennacl/linalg/lanczos.hpp
  *   @brief Generic interface for the Lanczos algorithm.
++<<<<<<< HEAD
 +* 
 +*   Contributed by Guenther Mader and Astrid Rupp.
 +*/
 +
 +#include <math.h>    //for sqrt()
++=======
+ *
+ *   Contributed by Guenther Mader and Astrid Rupp.
+ */
+ 
+ #include <cmath>
++>>>>>>> upstream/1.5.1
  #include <vector>
  #include "viennacl/vector.hpp"
  #include "viennacl/compressed_matrix.hpp"
@@@ -39,21 -40,21 +58,37 @@@
  #include <boost/numeric/ublas/matrix_expression.hpp>
  #include <boost/numeric/ublas/matrix_sparse.hpp>
  #include <boost/numeric/ublas/vector.hpp>
++<<<<<<< HEAD
 +#include <boost/numeric/ublas/operation.hpp> 
++=======
+ #include <boost/numeric/ublas/operation.hpp>
++>>>>>>> upstream/1.5.1
  #include <boost/numeric/ublas/vector_expression.hpp>
  #include <boost/numeric/ublas/io.hpp>
  
  namespace viennacl
  {
++<<<<<<< HEAD
 +  namespace linalg 
 +  {
 +    
 +    /** @brief A tag for the lanczos algorithm. 
 +    */
 +    class lanczos_tag 
 +    {
 +      public:
 +        
++=======
+   namespace linalg
+   {
+ 
+     /** @brief A tag for the lanczos algorithm.
+     */
+     class lanczos_tag
+     {
+       public:
+ 
++>>>>>>> upstream/1.5.1
          enum
          {
            partial_reorthogonalization = 0,
@@@ -64,57 -65,57 +99,106 @@@
          /** @brief The constructor
          *
          * @param factor                 Exponent of epsilon - tolerance for batches of Reorthogonalization
++<<<<<<< HEAD
 +        * @param num_eigenvalues        Number of eigenvalues to be returned
 +        * @param met                    Method for the Lanczos algorithm: 0 for partial reorthogonalization, 1 for full reorthogonalization and 2 for Lanczos without reorthogonalization
 +        * @param krylov_size            Maximal krylov-space size
 +        */
 +
 +        lanczos_tag(double factor = 0.75,
 +                    std::size_t numeig = 10,
 +                    int met = 0,
 +                    std::size_t krylov = 100) : factor_(factor), num_eigenvalues_(numeig), method_(met), krylov_size_(krylov) {};
++=======
+         * @param numeig                 Number of eigenvalues to be returned
+         * @param met                    Method for the Lanczos algorithm: 0 for partial reorthogonalization, 1 for full reorthogonalization and 2 for Lanczos without reorthogonalization
+         * @param krylov                 Maximum krylov-space size
+         */
+ 
+         lanczos_tag(double factor = 0.75,
+                     vcl_size_t numeig = 10,
+                     int met = 0,
+                     vcl_size_t krylov = 100) : factor_(factor), num_eigenvalues_(numeig), method_(met), krylov_size_(krylov) {}
++>>>>>>> upstream/1.5.1
  
          /** @brief Sets the number of eigenvalues */
          void num_eigenvalues(int numeig){ num_eigenvalues_ = numeig; }
  
            /** @brief Returns the number of eigenvalues */
++<<<<<<< HEAD
 +        std::size_t num_eigenvalues() const { return num_eigenvalues_; }
++=======
+         vcl_size_t num_eigenvalues() const { return num_eigenvalues_; }
++>>>>>>> upstream/1.5.1
  
            /** @brief Sets the exponent of epsilon */
          void factor(double fct) { factor_ = fct; }
  
          /** @brief Returns the exponent */
          double factor() const { return factor_; }
++<<<<<<< HEAD
 +        
 +        /** @brief Sets the size of the krylov space */
 +        void krylov_size(int max) { krylov_size_ = max; }
 +
 +        /** @brief Returns the size of the krylov space */
 +        std::size_t  krylov_size() const { return krylov_size_; }
 +
 +        /** @brief Sets the reorthogonalization method */ 
 +        void method(int met){ method_ = met; }
 +        
 +        /** @brief Returns the reorthogonalization method */ 
 +        int method() const { return method_; }
 +
 +
 +      private: 
 +        double factor_;
 +        std::size_t num_eigenvalues_;
 +        int method_; // see enum defined above for possible values
 +        std::size_t krylov_size_;
 +
 +    };
 +    
 +    
 +    namespace detail
 +    {
 +      /** 
 +      *   @brief Implementation of the Lanczos PRO algorithm
 +      *   
 +      *   @param A            The system matrix
 +      *   @param r            Random start vector 
++=======
+ 
+         /** @brief Sets the size of the krylov space */
+         void krylov_size(int max) { krylov_size_ = max; }
+ 
+         /** @brief Returns the size of the krylov space */
+         vcl_size_t  krylov_size() const { return krylov_size_; }
+ 
+         /** @brief Sets the reorthogonalization method */
+         void method(int met){ method_ = met; }
+ 
+         /** @brief Returns the reorthogonalization method */
+         int method() const { return method_; }
+ 
+ 
+       private:
+         double factor_;
+         vcl_size_t num_eigenvalues_;
+         int method_; // see enum defined above for possible values
+         vcl_size_t krylov_size_;
+ 
+     };
+ 
+ 
+     namespace detail
+     {
+       /**
+       *   @brief Implementation of the Lanczos PRO algorithm
+       *
+       *   @param A            The system matrix
+       *   @param r            Random start vector
++>>>>>>> upstream/1.5.1
        *   @param size         Size of krylov-space
        *   @param tag          Lanczos_tag with several options for the algorithm
        *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
@@@ -124,29 -125,29 +208,55 @@@
        std::vector<
                typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
                >
++<<<<<<< HEAD
 +      lanczosPRO (MatrixT const& A, VectorT & r, int size, lanczos_tag const & tag)
 +      {
 +    
 +        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
 +        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
 +
 +        
 +        // generation of some random numbers, used for lanczos PRO algorithm
 +        boost::mt11213b mt;
 +        boost::normal_distribution<double> N(0, 1);
 +        boost::bernoulli_distribution<double> B(0.5);
 +        boost::triangle_distribution<double> T(-1, 0, 1);
 +
 +        boost::variate_generator<boost::mt11213b&, boost::normal_distribution<double> >     get_N(mt, N);
 +        boost::variate_generator<boost::mt11213b&, boost::bernoulli_distribution<double> >  get_B(mt, B);
 +        boost::variate_generator<boost::mt11213b&, boost::triangle_distribution<double> >   get_T(mt, T);
 +
 +        
 +        long i, j, k, index, retry, reorths;
 +        std::vector<long> l_bound(size/2), u_bound(size/2);
 +        bool second_step;
 +        double squ_eps, eta, temp, eps, retry_th;
 +        long n = r.size();
++=======
+       lanczosPRO (MatrixT const& A, VectorT & r, vcl_size_t size, lanczos_tag const & tag)
+       {
+ 
+         typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
+         typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+ 
+ 
+         // generation of some random numbers, used for lanczos PRO algorithm
+         boost::mt11213b mt;
+         boost::normal_distribution<CPU_ScalarType> N(0, 1);
+         boost::bernoulli_distribution<CPU_ScalarType> B(0.5);
+         boost::triangle_distribution<CPU_ScalarType> T(-1, 0, 1);
+ 
+         boost::variate_generator<boost::mt11213b&, boost::normal_distribution<CPU_ScalarType> >     get_N(mt, N);
+         boost::variate_generator<boost::mt11213b&, boost::bernoulli_distribution<CPU_ScalarType> >  get_B(mt, B);
+         boost::variate_generator<boost::mt11213b&, boost::triangle_distribution<CPU_ScalarType> >   get_T(mt, T);
+ 
+ 
+         long i, j, k, index, retry, reorths;
+         std::vector<long> l_bound(size/2), u_bound(size/2);
+         bool second_step;
+         CPU_ScalarType squ_eps, eta, temp, eps, retry_th;
+         vcl_size_t n = r.size();
++>>>>>>> upstream/1.5.1
          std::vector< std::vector<CPU_ScalarType> > w(2, std::vector<CPU_ScalarType>(size));
          CPU_ScalarType cpu_beta;
  
@@@ -160,28 -161,28 +270,51 @@@
          boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
  
          second_step = false;
++<<<<<<< HEAD
 +        eps = std::numeric_limits<double>::epsilon();
 +        squ_eps = sqrt(eps);
 +        retry_th = 1e-2;
 +        eta =  exp(log(eps) * tag.factor());
 +        reorths = 0;
 +        retry = 0;
 +        
 +        vcl_beta = viennacl::linalg::norm_2(r);
 +        
 +        r /= vcl_beta;
 +        
 +        detail::copy_vec_to_vec(r,s);
 +        boost::numeric::ublas::column(Q, 0) = s;
 +        
++=======
+         eps = std::numeric_limits<CPU_ScalarType>::epsilon();
+         squ_eps = std::sqrt(eps);
+         retry_th = 1e-2;
+         eta = std::exp(std::log(eps) * tag.factor());
+         reorths = 0;
+         retry = 0;
+ 
+         vcl_beta = viennacl::linalg::norm_2(r);
+ 
+         r /= vcl_beta;
+ 
+         detail::copy_vec_to_vec(r,s);
+         boost::numeric::ublas::column(Q, 0) = s;
+ 
++>>>>>>> upstream/1.5.1
          VectorT u = viennacl::linalg::prod(A, r);
          vcl_alpha = viennacl::linalg::inner_prod(u, r);
          alphas.push_back(vcl_alpha);
          w[0][0] = 1;
          betas.push_back(vcl_beta);
++<<<<<<< HEAD
 +        
 +        long batches = 0;
 +        for(i = 1;i < size; i++)
++=======
+ 
+         long batches = 0;
+         for(i = 1;i < static_cast<long>(size); i++)
++>>>>>>> upstream/1.5.1
          {
            r = u - vcl_alpha * r;
            vcl_beta = viennacl::linalg::norm_2(r);
@@@ -193,10 -194,10 +326,17 @@@
            w[index][i] = 1;
            k = (i + 1) % 2;
            w[index][0] = (betas[1] * w[k][1] + (alphas[0] - vcl_alpha) * w[k][0] - betas[i - 1] * w[index][0]) / vcl_beta + eps * 0.3 * get_N() * (betas[1] + vcl_beta);
++<<<<<<< HEAD
 +          
 +          for(j = 1;j < i - 1;j++)
 +          {
 +                  w[index][j] = (betas[j + 1] * w[k][j + 1] + (alphas[j] - vcl_alpha) * w[k][j] + betas[j] * w[k][j - 1] - betas[i - 1] * w[index][j]) / vcl_beta + eps * 0.3 * get_N() * (betas[j + 1] + vcl_beta);      
++=======
+ 
+           for(j = 1;j < i - 1;j++)
+           {
+                   w[index][j] = (betas[j + 1] * w[k][j + 1] + (alphas[j] - vcl_alpha) * w[k][j] + betas[j] * w[k][j - 1] - betas[i - 1] * w[index][j]) / vcl_beta + eps * 0.3 * get_N() * (betas[j + 1] + vcl_beta);
++>>>>>>> upstream/1.5.1
            }
            w[index][i - 1] = 0.6 * eps * n * get_N() * betas[1] / vcl_beta;
  
@@@ -211,7 -212,7 +351,11 @@@
                {
                  detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
                  inner_rt = viennacl::linalg::inner_prod(r,t);
++<<<<<<< HEAD
 +                r = r - inner_rt * t;   
++=======
+                 r = r - inner_rt * t;
++>>>>>>> upstream/1.5.1
                  w[index][k] = 1.5 * eps * get_N();
                  reorths++;
                }
@@@ -224,8 -225,8 +368,13 @@@
            batches = 0;
  
            for(j = 0;j < i;j++)
++<<<<<<< HEAD
 +          { 
 +            if(fabs(w[index][j]) >= squ_eps)
++=======
+           {
+             if(std::fabs(w[index][j]) >= squ_eps)
++>>>>>>> upstream/1.5.1
              {
                detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, j), t);
                inner_rt = viennacl::linalg::inner_prod(r,t);
@@@ -233,7 -234,7 +382,11 @@@
                w[index][j] = 1.5 * eps * get_N();
                k = j - 1;
                reorths++;
++<<<<<<< HEAD
 +              while(k >= 0 && fabs(w[index][k]) > eta)
++=======
+               while(k >= 0 && std::fabs(w[index][k]) > eta)
++>>>>>>> upstream/1.5.1
                {
                  detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
                  inner_rt = viennacl::linalg::inner_prod(r,t);
@@@ -244,12 -245,12 +397,21 @@@
                }
                l_bound[batches] = k + 1;
                k = j + 1;
++<<<<<<< HEAD
 +              
 +              while(k < i && fabs(w[index][k]) > eta)
 +              {
 +                detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
 +                inner_rt = viennacl::linalg::inner_prod(r,t);
 +                r = r - inner_rt * t;   
++=======
+ 
+               while(k < i && std::fabs(w[index][k]) > eta)
+               {
+                 detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
+                 inner_rt = viennacl::linalg::inner_prod(r,t);
+                 r = r - inner_rt * t;
++>>>>>>> upstream/1.5.1
                  w[index][k] = 1.5 * eps * get_N();
                  k++;
                  reorths++;
@@@ -259,7 -260,7 +421,11 @@@
                j = k;
              }
            }
++<<<<<<< HEAD
 +          
++=======
+ 
++>>>>>>> upstream/1.5.1
            if(batches > 0)
            {
              temp = viennacl::linalg::norm_2(r);
@@@ -282,7 -283,7 +448,11 @@@
                vcl_beta = vcl_beta * temp;
              }
            }
++<<<<<<< HEAD
 +      
++=======
+ 
++>>>>>>> upstream/1.5.1
            detail::copy_vec_to_vec(r,s);
            boost::numeric::ublas::column(Q, i) = s;
  
@@@ -295,35 -296,33 +465,61 @@@
          }
  
          return bisect(alphas, betas);
++<<<<<<< HEAD
 +      
 +      }
 +
 +
 +      /** 
 +      *   @brief Implementation of the lanczos algorithm without reorthogonalization
 +      * 
 +      *   @param A            The system matrix
 +      *   @param r            Random start vector 
 +      *   @param size         Size of krylov-space
 +      *   @param tag          Lanczos_tag with several options for the algorithm
++=======
+ 
+       }
+ 
+ 
+       /**
+       *   @brief Implementation of the lanczos algorithm without reorthogonalization
+       *
+       *   @param A            The system matrix
+       *   @param r            Random start vector
+       *   @param size         Size of krylov-space
++>>>>>>> upstream/1.5.1
        *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
        */
        template< typename MatrixT, typename VectorT >
        std::vector<
                typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
                >
++<<<<<<< HEAD
 +      lanczos (MatrixT const& A, VectorT & r, int size, lanczos_tag const & tag)
 +      {
 +      
 +        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
 +        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
 +
 +        long i;
++=======
+       lanczos (MatrixT const& A, VectorT & r, vcl_size_t size, lanczos_tag)
+       {
+ 
+         typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
+         typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+ 
++>>>>>>> upstream/1.5.1
          ScalarType vcl_beta;
          ScalarType vcl_alpha;
          std::vector<CPU_ScalarType> alphas, betas;
          CPU_ScalarType norm;
++<<<<<<< HEAD
 +        long n = r.size();
++=======
+         vcl_size_t n = r.size();
++>>>>>>> upstream/1.5.1
          VectorT u(n), t(n);
          boost::numeric::ublas::vector<CPU_ScalarType> s(r.size()), u_zero(n), q(n);
          boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
@@@ -331,8 -330,8 +527,13 @@@
          u_zero = boost::numeric::ublas::zero_vector<CPU_ScalarType>(n);
          detail::copy_vec_to_vec(u_zero, u);
          norm = norm_2(r);
++<<<<<<< HEAD
 +        
 +        for(i = 0;i < size; i++)
++=======
+ 
+         for(vcl_size_t i = 0;i < size; i++)
++>>>>>>> upstream/1.5.1
          {
            r /= norm;
            vcl_beta = norm;
@@@ -357,45 -356,44 +558,79 @@@
          return bisect(alphas, betas);
        }
  
++<<<<<<< HEAD
 +      /** 
 +      *   @brief Implementation of the Lanczos FRO algorithm
 +      *   
 +      *   @param A            The system matrix 
 +      *   @param r            Random start vector 
 +      *   @param size         Size of krylov-space
 +      *   @param tag          Lanczos_tag with several options for the algorithm
++=======
+       /**
+       *   @brief Implementation of the Lanczos FRO algorithm
+       *
+       *   @param A            The system matrix
+       *   @param r            Random start vector
+       *   @param size         Size of krylov-space
++>>>>>>> upstream/1.5.1
        *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
        */
        template< typename MatrixT, typename VectorT >
        std::vector<
                typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
                >
++<<<<<<< HEAD
 +      lanczosFRO (MatrixT const& A, VectorT & r, int size, lanczos_tag const & tag)
 +      {
 +        
 +        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
 +        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
 +        
++=======
+       lanczosFRO (MatrixT const& A, VectorT & r, vcl_size_t size, lanczos_tag)
+       {
+ 
+         typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
+         typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+ 
++>>>>>>> upstream/1.5.1
            CPU_ScalarType temp;
            CPU_ScalarType norm;
            ScalarType vcl_beta;
            ScalarType vcl_alpha;
            std::vector<CPU_ScalarType> alphas, betas;
++<<<<<<< HEAD
 +          long n = r.size();
++=======
+           vcl_size_t n = r.size();
++>>>>>>> upstream/1.5.1
            VectorT u(n), t(n);
            ScalarType inner_rt;
            boost::numeric::ublas::vector<CPU_ScalarType> u_zero(n), s(r.size()), q(n);
            boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
++<<<<<<< HEAD
 +          
++=======
+ 
++>>>>>>> upstream/1.5.1
            long reorths = 0;
            norm = norm_2(r);
  
  
++<<<<<<< HEAD
 +          for(long i = 0; i < size; i++)
 +          {
 +            r /= norm;
 +
 +            for(long j = 0; j < i; j++)
++=======
+           for(vcl_size_t i = 0; i < size; i++)
+           {
+             r /= norm;
+ 
+             for(vcl_size_t j = 0; j < i; j++)
++>>>>>>> upstream/1.5.1
              {
                q = boost::numeric::ublas::column(Q, j);
                detail::copy_vec_to_vec(q, t);
@@@ -419,15 -417,15 +654,27 @@@
              alphas.push_back(vcl_alpha);
              betas.push_back(vcl_beta);
            }
++<<<<<<< HEAD
 +          
 +          return bisect(alphas, betas);
 +      }
 +
 +    } // end namespace detail    
 +
 +    /** 
 +    *   @brief Implementation of the calculation of eigenvalues using lanczos
 +    *   
++=======
+ 
+           return bisect(alphas, betas);
+       }
+ 
+     } // end namespace detail
+ 
+     /**
+     *   @brief Implementation of the calculation of eigenvalues using lanczos
+     *
++>>>>>>> upstream/1.5.1
      *   @param matrix        The system matrix
      *   @param tag           Tag with several options for the lanczos algorithm
      *   @return              Returns the n largest eigenvalues (n defined in the lanczos_tag)
@@@ -439,29 -437,29 +686,55 @@@
        typedef typename viennacl::result_of::value_type<MatrixT>::type           ScalarType;
        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
        typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type    VectorT;
++<<<<<<< HEAD
 +    
 +      boost::mt11213b mt;
 +      boost::normal_distribution<double> N(0, 1);
 +      boost::bernoulli_distribution<double> B(0.5);
 +      boost::triangle_distribution<double> T(-1, 0, 1);
 +
 +      boost::variate_generator<boost::mt11213b&, boost::normal_distribution<double> >     get_N(mt, N);
 +      boost::variate_generator<boost::mt11213b&, boost::bernoulli_distribution<double> >  get_B(mt, B);
 +      boost::variate_generator<boost::mt11213b&, boost::triangle_distribution<double> >   get_T(mt, T);
 +      
 +      std::vector<CPU_ScalarType> eigenvalues;
 +      std::size_t matrix_size = matrix.size1();
 +      VectorT r(matrix_size);
 +      std::vector<CPU_ScalarType> s(matrix_size);
 +      
 +      for(std::size_t i=0; i<s.size(); ++i)
 +        s[i] = 3.0 * get_B() + get_T() - 1.5; 
 +
 +      detail::copy_vec_to_vec(s,r);
 +
 +      std::size_t size_krylov = (matrix_size < tag.krylov_size()) ? matrix_size
 +                                                                  : tag.krylov_size();
 +      
++=======
+ 
+       boost::mt11213b mt;
+       boost::normal_distribution<CPU_ScalarType> N(0, 1);
+       boost::bernoulli_distribution<CPU_ScalarType> B(0.5);
+       boost::triangle_distribution<CPU_ScalarType> T(-1, 0, 1);
+ 
+       boost::variate_generator<boost::mt11213b&, boost::normal_distribution<CPU_ScalarType> >     get_N(mt, N);
+       boost::variate_generator<boost::mt11213b&, boost::bernoulli_distribution<CPU_ScalarType> >  get_B(mt, B);
+       boost::variate_generator<boost::mt11213b&, boost::triangle_distribution<CPU_ScalarType> >   get_T(mt, T);
+ 
+       std::vector<CPU_ScalarType> eigenvalues;
+       vcl_size_t matrix_size = matrix.size1();
+       VectorT r(matrix_size);
+       std::vector<CPU_ScalarType> s(matrix_size);
+ 
+       for(vcl_size_t i=0; i<s.size(); ++i)
+         s[i] = 3.0 * get_B() + get_T() - 1.5;
+ 
+       detail::copy_vec_to_vec(s,r);
+ 
+       vcl_size_t size_krylov = (matrix_size < tag.krylov_size()) ? matrix_size
+                                                                   : tag.krylov_size();
+ 
++>>>>>>> upstream/1.5.1
        switch(tag.method())
        {
          case lanczos_tag::partial_reorthogonalization:
@@@ -472,21 -470,21 +745,41 @@@
            break;
          case lanczos_tag::no_reorthogonalization:
            eigenvalues = detail::lanczos(matrix, r, size_krylov, tag);
++<<<<<<< HEAD
 +          break;                
++=======
+           break;
++>>>>>>> upstream/1.5.1
        }
  
        std::vector<CPU_ScalarType> largest_eigenvalues;
  
++<<<<<<< HEAD
 +      for(std::size_t i = 1; i<=tag.num_eigenvalues(); i++)
 +        largest_eigenvalues.push_back(eigenvalues[size_krylov-i]);
 +    
 +    
 +      return largest_eigenvalues;
 +    }
 +    
 +    
 +
 +    
 +  } // end namespace linalg
 +} // end namespace viennacl
- #endif
++#endif
++=======
+       for(vcl_size_t i = 1; i<=tag.num_eigenvalues(); i++)
+         largest_eigenvalues.push_back(eigenvalues[size_krylov-i]);
+ 
+ 
+       return largest_eigenvalues;
+     }
+ 
+ 
+ 
+ 
+   } // end namespace linalg
+ } // end namespace viennacl
+ #endif
++>>>>>>> upstream/1.5.1
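
Pulling the lanczos.hpp changes together: lanczos_tag selects the reorthogonalization variant and eig() dispatches to lanczosPRO(), lanczos() or lanczosFRO() above. A call-site sketch; the constructor argument order (factor, numeig, met, krylov) is taken verbatim from the merged tag:

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/linalg/lanczos.hpp"   // requires Boost, per the includes above

    std::vector<double> largest_eigenvalues(viennacl::compressed_matrix<double> const & A)
    {
      viennacl::linalg::lanczos_tag tag(0.75,  // exponent factor for epsilon
                                        8,     // number of eigenvalues requested
                                        viennacl::linalg::lanczos_tag::partial_reorthogonalization,
                                        100);  // Krylov-space cap (clamped to matrix size above)
      return viennacl::linalg::eig(A, tag);    // the 8 largest eigenvalues
    }
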
diff --cc viennacl/linalg/matrix_operations.hpp
index 1dcc167,5ca490e..5607ace
--- a/viennacl/linalg/matrix_operations.hpp
+++ b/viennacl/linalg/matrix_operations.hpp
@@@ -36,192 -34,117 +34,252 @@@
  #include "viennacl/traits/start.hpp"
  #include "viennacl/traits/handle.hpp"
  #include "viennacl/traits/stride.hpp"
++<<<<<<< HEAD
 +#include "viennacl/tools/matrix_kernel_class_deducer.hpp"
 +#include "viennacl/tools/matrix_prod_kernel_class_deducer.hpp"
 +#include "viennacl/linalg/kernels/vector_kernels.h"
 +#include "viennacl/linalg/kernels/matrix_row_kernels.h"
 +#include "viennacl/linalg/kernels/matrix_col_kernels.h"
 +
 +#include "viennacl/linalg/kernels/matrix_prod_col_col_col_kernels.h"
 +#include "viennacl/linalg/kernels/matrix_prod_col_col_row_kernels.h"
 +#include "viennacl/linalg/kernels/matrix_prod_col_row_col_kernels.h"
 +#include "viennacl/linalg/kernels/matrix_prod_col_row_row_kernels.h"
 +
 +#include "viennacl/linalg/kernels/matrix_prod_row_col_col_kernels.h"
 +#include "viennacl/linalg/kernels/matrix_prod_row_col_row_kernels.h"
 +#include "viennacl/linalg/kernels/matrix_prod_row_row_col_kernels.h"
 +#include "viennacl/linalg/kernels/matrix_prod_row_row_row_kernels.h"
++=======
+ #include "viennacl/vector.hpp"
+ #include "viennacl/linalg/host_based/matrix_operations.hpp"
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+   #include "viennacl/linalg/opencl/matrix_operations.hpp"
+ #endif
+ 
+ #ifdef VIENNACL_WITH_CUDA
+   #include "viennacl/linalg/cuda/matrix_operations.hpp"
+ #endif
++>>>>>>> upstream/1.5.1
  
  namespace viennacl
  {
    namespace linalg
    {
++<<<<<<< HEAD
 +    
 +    /** @brief Assign a matrix (-range/-slice) to another matrix (-range/slice).
 +    *
 +    * Computes mat1 = mat2.
 +    * 
 +    * @param mat1  The destination matrix
 +    * @param mat2  The source matrix
 +    */
 +    template <typename M1, typename M2>
 +    typename viennacl::enable_if< viennacl::is_matrix<M1>::value
 +                                  && viennacl::is_matrix<M2>::value
 +                                >::type
 +    assign(M1       & mat1,
 +           M2 const & mat2)
 +    {
 +      typedef typename viennacl::result_of::cpu_value_type<M1>::type        value_type;
 +      
 +      assert( (viennacl::traits::size1(mat1) == viennacl::traits::size1(mat2))
 +             && (viennacl::traits::size2(mat1) == viennacl::traits::size2(mat2))
 +             && "Incompatible matrix sizes in assign()!");
 +      
 +      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< M1 >::ResultType    KernelClass;
 +      
 +      
 +      std::size_t block_size = 16;
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "assign");
 +      k.global_work_size(0, block_size*block_size);
 +      k.global_work_size(1, block_size*block_size);
 +      k.local_work_size(0, block_size);
 +      k.local_work_size(1, block_size);
 +
 +        viennacl::ocl::enqueue(k(viennacl::traits::handle(mat1), 
 +                                        cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)), 
 +                                        cl_uint(viennacl::traits::stride1(mat1)),             cl_uint(viennacl::traits::stride2(mat1)),
 +                                        cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
 +                                        cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
 +                                 viennacl::traits::handle(mat2), 
 +                                        cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)), 
 +                                        cl_uint(viennacl::traits::stride1(mat2)),             cl_uint(viennacl::traits::stride2(mat2)),
 +                                        cl_uint(viennacl::traits::size1(mat2)),            cl_uint(viennacl::traits::size2(mat2)),
 +                                        cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2))
 +                                )
 +                              );
 +    }
 +    
 +    
 +    //
 +    ///////////////////////////////////// addition and subtraction///////////////////////////////////////////////
 +    //
 +    
 +    namespace detail
++=======
+ 
+     template <typename NumericT, typename F,
+               typename ScalarType1>
+     void am(matrix_base<NumericT, F> & mat1,
+             matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
++>>>>>>> upstream/1.5.1
      {
-       template<class T1, class T2, class T3>
-       typename viennacl::enable_if<   viennacl::is_matrix<T1>::value 
-                                    && viennacl::is_matrix<T2>::value 
-                                    && viennacl::is_matrix<T3>::value >::type
-       add_sub_impl(const T1 & mat1, 
-                    const T2 & mat2,
-                          T3 & result,
-                    std::string kernel_name
-                   )
+       switch (viennacl::traits::handle(mat1).get_active_handle_id())
        {
++<<<<<<< HEAD
 +        assert(result.size1() == mat1.size1());
 +        assert(result.size2() == mat1.size2());
 +        assert(result.size1() == mat2.size1());
 +        assert(result.size2() == mat2.size2());
 +
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
 +        
 +        std::size_t block_size = 16;
 +        
 +        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
 +        k.global_work_size(0, block_size*block_size);
 +        k.global_work_size(1, block_size*block_size);
 +        k.local_work_size(0, block_size);
 +        k.local_work_size(1, block_size);
 +        viennacl::ocl::enqueue(k(viennacl::traits::handle(mat1), 
 +                                        cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)), 
 +                                        cl_uint(viennacl::traits::stride1(mat1)),             cl_uint(viennacl::traits::stride2(mat1)),
 +                                        cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
 +                                        cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
 +                                viennacl::traits::handle(mat2), 
 +                                        cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)), 
 +                                        cl_uint(viennacl::traits::stride1(mat2)),             cl_uint(viennacl::traits::stride2(mat2)),
 +                                        cl_uint(viennacl::traits::size1(mat2)),            cl_uint(viennacl::traits::size2(mat2)),
 +                                        cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2)),
 +                                viennacl::traits::handle(result), 
 +                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
 +                                        cl_uint(viennacl::traits::stride1(result)),           cl_uint(viennacl::traits::stride2(result)),
 +                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
 +                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
 +                                )
 +                              );        
++=======
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
++>>>>>>> upstream/1.5.1
        }
-       
+     }
  
  
-       template <typename T1, typename T2>
-       typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                     && viennacl::is_matrix<T2>::value
-                                   >::type
-       inplace_add_sub_impl(T1 & result, T2 const & mat2, std::string kernel_name)
+     template <typename NumericT, typename F,
+               typename ScalarType1, typename ScalarType2>
+     void ambm(matrix_base<NumericT, F> & mat1,
+               matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+               matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+     {
+       switch (viennacl::traits::handle(mat1).get_active_handle_id())
        {
++<<<<<<< HEAD
 +        assert(viennacl::traits::size1(result) == viennacl::traits::size1(mat2));
 +        assert(viennacl::traits::size2(result) == viennacl::traits::size2(mat2));
 +
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
 +        
 +        std::size_t block_size = 16;
 +        
 +        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
 +        k.global_work_size(0, block_size*block_size);
 +        k.global_work_size(1, block_size*block_size);
 +        k.local_work_size(0, block_size);
 +        k.local_work_size(1, block_size);
 +        
 +        viennacl::ocl::enqueue(k(viennacl::traits::handle(result),
 +                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
 +                                        cl_uint(viennacl::traits::stride1(result)),           cl_uint(viennacl::traits::stride2(result)),
 +                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
 +                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
 +                                viennacl::traits::handle(mat2), 
 +                                        cl_uint(viennacl::traits::start1(mat2)),            cl_uint(viennacl::traits::start2(mat2)), 
 +                                        cl_uint(viennacl::traits::stride1(mat2)),              cl_uint(viennacl::traits::stride2(mat2)),
 +                                        cl_uint(viennacl::traits::size1(mat2)),             cl_uint(viennacl::traits::size2(mat2)),
 +                                        cl_uint(viennacl::traits::internal_size1(mat2)),    cl_uint(viennacl::traits::internal_size2(mat2))
 +                                )
 +                              );
++=======
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::ambm(mat1,
+                                              mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                              mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::ambm(mat1,
+                                          mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                          mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::ambm(mat1,
+                                        mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                        mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
++>>>>>>> upstream/1.5.1
        }
-       
-     }
-     
-     /** @brief Adds two dense matrices or submatrices and writes the result to a third matrix or submatrix
-     *
-     * This is the implementation of the convenience expression result = mat1 + mat2;
-     *
-     * @param mat1   The left hand side operand
-     * @param mat2   The right hand side operand
-     * @param result The resulting matrix
-     */
-     template<class T1, class T2, class T3>
-     typename viennacl::enable_if<   viennacl::is_matrix<T1>::value 
-                                  && viennacl::is_matrix<T2>::value 
-                                  && viennacl::is_matrix<T3>::value >::type
-     add(const T1 & mat1, 
-         const T2 & mat2,
-               T3 & result)
-     {
-       detail::add_sub_impl(mat1, mat2, result, "add");
      }
  
-     /** @brief Adds a dense matrix or submatrix to another
-     *
-     * This is the implementation of the convenience expression result += mat1;
-     *
-     * @param mat2   The addend (either a matrix or a matrix_range)
-     * @param result The resulting matrix  (either a matrix or a matrix_range)
-     */
-     template <typename T1, typename T2>
-     typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                   && viennacl::is_matrix<T2>::value
-                                 >::type
-     inplace_add(T1 & result, T2 const & mat2)
+ 
+     template <typename NumericT, typename F,
+               typename ScalarType1, typename ScalarType2>
+     void ambm_m(matrix_base<NumericT, F> & mat1,
+                 matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                 matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
      {
-       detail::inplace_add_sub_impl(result, mat2, "inplace_add");
+       switch (viennacl::traits::handle(mat1).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::ambm_m(mat1,
+                                                mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                                mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::ambm_m(mat1,
+                                            mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                            mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::ambm_m(mat1,
+                                          mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                          mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
      }
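
For reference, a minimal usage sketch of the expressions these ambm()/ambm_m() dispatchers serve; the matrix sizes and scalar values below are illustrative assumptions:

    #include "viennacl/matrix.hpp"

    viennacl::matrix<double> M1(64, 64), M2(64, 64), M3(64, 64);
    double alpha = 2.0, beta = -0.5;
    M1  = alpha * M2 + beta * M3;   // expression templates route this to ambm()
    M1 += alpha * M2 + beta * M3;   // the accumulating form routes to ambm_m()

The len/reciprocal/flip_sign parameters encode variants such as division by alpha or a negated coefficient, so one dispatcher covers those cases without separate entry points.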
  
  
@@@ -262,38 -200,29 +335,50 @@@
      }
  
  
- 
- 
-     //
-     /////////////////////////   inplace multiplication and division /////////////////////////////////
-     //
- 
-     namespace detail
+     /** @brief Dispatcher interface for A = diag(v, k) */
+     template <typename NumericT, typename F>
+     void matrix_diag_from_vector(const vector_base<NumericT> & v, int k, matrix_base<NumericT, F> & A)
      {
-       template <typename  T1, typename ScalarType>
-       typename viennacl::enable_if< viennacl::is_matrix<T1>::value >::type
-       inplace_mult_div_impl(T1 & result, 
-                             ScalarType val,
-                             std::string kernel_name)
+       switch (viennacl::traits::handle(v).get_active_handle_id())
        {
++<<<<<<< HEAD
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
 +        
 +        std::size_t block_size = 16;
 +          
 +        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
 +        
 +        k.global_work_size(0, block_size*block_size);
 +        k.global_work_size(1, block_size*block_size);
 +        k.local_work_size(0, block_size);
 +        k.local_work_size(1, block_size);
 +        
 +        viennacl::ocl::enqueue(k(viennacl::traits::handle(result),
 +                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
 +                                        cl_uint(viennacl::traits::stride1(result)),           cl_uint(viennacl::traits::stride2(result)),
 +                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
 +                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
 +                                val)
 +                              );
++=======
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::matrix_diag_from_vector(v, k, A);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::matrix_diag_from_vector(v, k, A);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::matrix_diag_from_vector(v, k, A);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
++>>>>>>> upstream/1.5.1
        }
      }
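
A short sketch of the dispatcher above, using the signature from this hunk directly; the sizes are assumptions (for diagonal offset k, A needs size(v) + |k| rows and columns):

    viennacl::vector<double> v(10);
    viennacl::matrix<double> A(12, 12);
    viennacl::linalg::matrix_diag_from_vector(v, 2, A);  // A = diag(v, k) with k = 2 (second superdiagonal)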
  
@@@ -355,24 -337,6 +493,27 @@@
  
  
      // A * x
++<<<<<<< HEAD
 +    /** @brief Returns a proxy class that represents matrix-vector multiplication
 +    *
 +    * This is used for the convenience expression result = prod(mat, vec);
 +    *
 +    * @param mat    The matrix
 +    * @param vec    The vector
 +    */
 +    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
 +    viennacl::vector_expression<const viennacl::matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
 +                                op_prod > prod_impl(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat, 
 +                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
 +    {
 +      return viennacl::vector_expression<const viennacl::matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                         const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
 +                                         op_prod >(mat, vec);
 +    }
 +
++=======
++>>>>>>> upstream/1.5.1
  
      /** @brief Carries out matrix-vector multiplication
      *
@@@ -382,265 -346,119 +523,325 @@@
      * @param vec    The vector
      * @param result The result vector
      */
++<<<<<<< HEAD
 +    template <typename MatrixType, typename VectorType1, typename VectorType2>
 +    typename viennacl::enable_if<   viennacl::is_matrix<MatrixType>::value 
 +                                  && viennacl::is_vector<VectorType1>::value 
 +                                  && viennacl::is_vector<VectorType2>::value >::type
 +    prod_impl(const MatrixType & mat, 
 +              const VectorType1 & vec, 
 +                    VectorType2 & result)
 +    {
 +      assert(mat.size2() == vec.size());
 +      // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
 +      assert(viennacl::traits::handle(vec).get() != viennacl::traits::handle(result).get() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
 +      //result.resize(mat.size1());
 +
 +      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< MatrixType >::ResultType    KernelClass;
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "vec_mul");
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(mat),
 +                               cl_uint(viennacl::traits::start1(mat)),         cl_uint(viennacl::traits::start2(mat)), 
 +                               cl_uint(viennacl::traits::stride1(mat)),           cl_uint(viennacl::traits::stride2(mat)),
 +                               cl_uint(viennacl::traits::size1(mat)),          cl_uint(viennacl::traits::size2(mat)),
 +                               cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),
 +                               viennacl::traits::handle(vec),
 +                                cl_uint(viennacl::traits::start(vec)),
 +                                cl_uint(viennacl::traits::stride(vec)),
 +                                cl_uint(viennacl::traits::size(vec)), 
 +                               viennacl::traits::handle(result),
 +                                cl_uint(viennacl::traits::start(result)),
 +                                cl_uint(viennacl::traits::stride(result)),
 +                                cl_uint(viennacl::traits::size(result))
 +                             ) );
 +    }
++=======
+     template <typename NumericT, typename F>
+     void prod_impl(const matrix_base<NumericT, F> & mat,
+                    const vector_base<NumericT> & vec,
+                          vector_base<NumericT> & result)
+     {
+       assert( (viennacl::traits::size1(mat) == viennacl::traits::size(result)) && bool("Size check failed at v1 = prod(A, v2): size1(A) != size(v1)"));
+       assert( (viennacl::traits::size2(mat) == viennacl::traits::size(vec))    && bool("Size check failed at v1 = prod(A, v2): size2(A) != size(v2)"));
++>>>>>>> upstream/1.5.1
  
+       switch (viennacl::traits::handle(mat).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::prod_impl(mat, vec, result);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::prod_impl(mat, vec, result);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::prod_impl(mat, vec, result);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
+     }
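
At the user level this dispatcher sits behind the prod() convenience wrapper from viennacl/linalg/prod.hpp; a minimal sketch, with sizes chosen to satisfy the two asserts above:

    viennacl::matrix<double> A(100, 50);
    viennacl::vector<double> x(50), y(100);
    y = viennacl::linalg::prod(A, x);   // evaluates via prod_impl(A, x, y)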
  
 -
      // trans(A) * x
-     /** @brief Returns a proxy class that represents matrix-vector multiplication with a transposed matrix
-     *
-     * This is used for the convenience expression result = trans(mat) * vec;
-     *
-     * @param proxy  The transposed matrix proxy
-     * @param vec    The vector
-     */
-     template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-     viennacl::vector_expression<const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                    const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                    op_trans>,
-                                 const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                 op_prod > prod_impl(const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                                        const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                                        op_trans> & proxy, 
-                                                     const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-     {
-       return viennacl::vector_expression<const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                             const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                             op_trans>,
-                                          const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                          op_prod >(proxy, vec);
-     }
  
-     /** @brief Unwraps the transposed matrix proxy and forwards to trans_prod_impl()
-     */
-     template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-     void prod_impl(const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                       const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                       op_trans> & mat,
-                     const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec, 
-                           viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
-     {
-       trans_prod_impl(mat.lhs(), vec, result);
-     }
-     
      /** @brief Carries out matrix-vector multiplication with a transposed matrix
      *
      * Implementation of the convenience expression result = trans(mat) * vec;
      *
-     * @param mat    The matrix
-     * @param vec    The vector
-     * @param result The result vector
+     * @param mat_trans  The transposed matrix proxy
+     * @param vec        The vector
+     * @param result     The result vector
      */
-     template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-     void trans_prod_impl(const matrix<SCALARTYPE, F, ALIGNMENT> & mat,
-                           const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec, 
-                                 viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
+     template <typename NumericT, typename F>
+     void prod_impl(const matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & mat_trans,
+                    const vector_base<NumericT> & vec,
+                          vector_base<NumericT> & result)
      {
++<<<<<<< HEAD
 +      assert(mat.size1() == vec.size());  //remember: mat is transposed!
 +      // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
 +      assert(vec.handle().get() != result.handle().get() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
 +      result.resize(mat.size2());
 +
 +      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "trans_vec_mul");
 +      
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(mat),
 +                               cl_uint(viennacl::traits::start1(mat)),         cl_uint(viennacl::traits::start2(mat)), 
 +                               cl_uint(viennacl::traits::stride1(mat)),           cl_uint(viennacl::traits::stride2(mat)),
 +                               cl_uint(viennacl::traits::size1(mat)),          cl_uint(viennacl::traits::size2(mat)),
 +                               cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),
 +                               viennacl::traits::handle(vec),
 +                                cl_uint(viennacl::traits::start(vec)),
 +                                cl_uint(viennacl::traits::stride(vec)),
 +                                cl_uint(viennacl::traits::size(vec)), 
 +                               viennacl::traits::handle(result),
 +                                cl_uint(viennacl::traits::start(result)),
 +                                cl_uint(viennacl::traits::stride(result)),
 +                                cl_uint(viennacl::traits::size(result))
 +                             ) );
 +    }
 +
 +
++=======
+       assert( (viennacl::traits::size1(mat_trans.lhs()) == viennacl::traits::size(vec))    && bool("Size check failed at v1 = trans(A) * v2: size1(A) != size(v2)"));
+       assert( (viennacl::traits::size2(mat_trans.lhs()) == viennacl::traits::size(result)) && bool("Size check failed at v1 = trans(A) * v2: size2(A) != size(v1)"));
++>>>>>>> upstream/1.5.1
  
+       switch (viennacl::traits::handle(mat_trans.lhs()).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::prod_impl(mat_trans, vec, result);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::prod_impl(mat_trans, vec, result);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::prod_impl(mat_trans, vec, result);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
+     }
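
The transposed variant mirrors this with the size checks swapped; a sketch assuming the same convenience layer, where trans(A) builds the op_trans proxy handled above:

    viennacl::matrix<double> A(50, 100);
    viennacl::vector<double> x(50), y(100);
    y = viennacl::linalg::prod(viennacl::trans(A), x);   // y = A^T * x without forming A^T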
  
  
      //
      /////////////////////////   matrix-matrix products /////////////////////////////////
      //
++<<<<<<< HEAD
 +    
 +    namespace detail
 +    {
 +      // C = A * B and possibly transposed variants
 +      template <typename T1, typename T2, typename T3 >
 +      void prod_slow_kernel(const T1 & A, 
 +                            const T2 & B, 
 +                            T3 & C,
 +                            std::string kernel_name)
 +      {
 +        typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
 +        
 +        typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< T1, T2, T3 >::ResultType    KernelClass;
 +        KernelClass::init();
 +        
 +        //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
 +        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
 +        
 +        k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size1(C), 16));
 +        k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size2(C), 16));
 +        k.local_work_size(0, 16);
 +        k.local_work_size(1, 16);
 +        
 +        cpu_value_type alpha(1);
 +        cpu_value_type beta(0);
 +        
 +        viennacl::ocl::enqueue(k(alpha,
 +                                 viennacl::traits::handle(A), 
 +                                        cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)), 
 +                                        cl_uint(viennacl::traits::stride1(A)),             cl_uint(viennacl::traits::stride2(A)),
 +                                        cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
 +                                        cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
 +                                 viennacl::traits::handle(B), 
 +                                        cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)), 
 +                                        cl_uint(viennacl::traits::stride1(B)),             cl_uint(viennacl::traits::stride2(B)),
 +                                        cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
 +                                        cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
 +                                 beta,
 +                                 viennacl::traits::handle(C), 
 +                                        cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)), 
 +                                        cl_uint(viennacl::traits::stride1(C)),             cl_uint(viennacl::traits::stride2(C)),
 +                                        cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
 +                                        cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
 +                                )
 +                              );        
 +      }
 +      
 +      // C = A * B, using fast kernel
 +      template <typename T1, typename T2, typename T3 >
 +      void prod_fast_kernel(const T1 & A, 
 +                            const T2 & B, 
 +                            T3 & C,
 +                            std::string kernel_name)
 +      {
 +        typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
 +        
 +        typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< T1, T2, T3 >::ResultType    KernelClass;
 +        KernelClass::init();
 +        
 +        //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
 +        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
 +        
 +        k.global_work_size(0, viennacl::traits::size2(C) / 4); //column blocks
 +        k.global_work_size(1, viennacl::traits::size1(C) / 4); //row blocks
 +        k.local_work_size(0, 16);  //columns
 +        k.local_work_size(1, 4);   //rows
 +        
 +        cpu_value_type alpha(1);
 +        cpu_value_type beta(0);
 +        
 +        viennacl::ocl::enqueue(k(alpha,
 +                                 viennacl::traits::handle(A), 
 +                                        cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)), 
 +                                        cl_uint(viennacl::traits::stride1(A)),             cl_uint(viennacl::traits::stride2(A)),
 +                                        cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
 +                                        cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
 +                                 viennacl::traits::handle(B), 
 +                                        cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)), 
 +                                        cl_uint(viennacl::traits::stride1(B)),             cl_uint(viennacl::traits::stride2(B)),
 +                                        cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
 +                                        cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
 +                                 beta,
 +                                 viennacl::traits::handle(C), 
 +                                        cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)), 
 +                                        cl_uint(viennacl::traits::stride1(C)),             cl_uint(viennacl::traits::stride2(C)),
 +                                        cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
 +                                        cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
 +                                )
 +                              );        
 +      }
 +      
 +      template <typename T1, typename T2, typename T3 >
 +      void prod(const T1 & A, 
 +                const T2 & B, 
 +                T3 & C,
 +                std::string fast_kernel_name,
 +                std::string slow_kernel_name)
 +      {
 +        if (   (viennacl::traits::size1(A) < 64)
 +            || (viennacl::traits::size2(A) < 64)
 +            || (viennacl::traits::size1(B) < 64) )   // most likely too little work to amortize the kernel launch overhead
 +        {
 +          prod_slow_kernel(A, B, C, slow_kernel_name);
 +        }
 +        else if (   (viennacl::traits::size1(A) % 64 == 0)
 +                 && (viennacl::traits::size2(A) % 64 == 0)
 +                 && (viennacl::traits::size1(B) % 64 == 0) )   // all dimensions are multiples of 64, so the fast kernel can be used
 +        {
 +          prod_fast_kernel(A, B, C, fast_kernel_name);
 +          //prod_slow_kernel(A, B, C, slow_kernel_name);
 +        }
 +        else //TODO: use four kernels
 +        {
 +          prod_slow_kernel(A, B, C, slow_kernel_name);
 +        }
 +        
 +      }
 +    }
 +
++=======
++>>>>>>> upstream/1.5.1
  
      /** @brief Carries out matrix-matrix multiplication
      *
      * Implementation of C = prod(A, B);
      *
      */
++<<<<<<< HEAD
 +    template <typename T1, typename T2, typename T3 >
 +    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
 +                                  && viennacl::is_matrix<T2>::value
 +                                  && viennacl::is_matrix<T3>::value
 +                                >::type
 +    prod_impl(const T1 & A, 
 +              const T2 & B, 
 +                    T3 & C)
 +    {
 +      assert(viennacl::traits::size1(A) == viennacl::traits::size1(C));
 +      assert(viennacl::traits::size2(A) == viennacl::traits::size1(B));
 +      assert(viennacl::traits::size2(B) == viennacl::traits::size2(C));
 +      // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
 +      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A).get() 
 +            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B).get()
 +            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
 +
 +      
 +      detail::prod(A, B, C, "prod16_AA", "prod_AA");
++=======
+     template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+     void prod_impl(const matrix_base<NumericT, F1> & A,
+                    const matrix_base<NumericT, F2> & B,
+                          matrix_base<NumericT, F3> & C,
+                    ScalarType alpha,
+                    ScalarType beta)
+     {
+       assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size check failed at C = prod(A, B): size1(A) != size1(C)"));
+       assert( (viennacl::traits::size2(A) == viennacl::traits::size1(B)) && bool("Size check failed at C = prod(A, B): size2(A) != size1(B)"));
+       assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size check failed at C = prod(A, B): size2(B) != size2(C)"));
+ 
+ 
+       switch (viennacl::traits::handle(A).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::prod_impl(A, B, C, alpha, beta);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::prod_impl(A, B, C, alpha, beta);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::prod_impl(A, B, C, alpha, beta);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
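
A GEMM-style sketch of this dispatcher; that alpha and beta act as C = alpha * prod(A, B) + beta * C in the usual BLAS sense is an assumption suggested by the parameter names:

    viennacl::matrix<double> A(64, 32), B(32, 48), C(64, 48);
    C = viennacl::linalg::prod(A, B);                 // plain product via the convenience layer
    viennacl::linalg::prod_impl(A, B, C, 2.0, 1.0);   // assumed semantics: C = 2*A*B + C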
  
  
@@@ -650,28 -468,39 +851,64 @@@
      * Implementation of C = prod(trans(A), B);
      *
      */
++<<<<<<< HEAD
 +    template <typename T1, typename T2, typename T3 >
 +    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
 +                                  && viennacl::is_matrix<T2>::value
 +                                  && viennacl::is_matrix<T3>::value
 +                                >::type
 +    prod_impl(const viennacl::matrix_expression< const T1,
 +                                                 const T1,
 +                                                 op_trans> & A, 
 +              const T2 & B, 
 +                    T3 & C)
 +    {
 +      //std::cout << "size2(A): " << viennacl::traits::size2(A.lhs()) << std::endl;
 +      //std::cout << "size1(C): " << viennacl::traits::size1(C) << std::endl;
 +      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C));
 +      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B));
 +      assert(viennacl::traits::size2(B) == viennacl::traits::size2(C));
 +      // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
 +      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A.lhs()).get() 
 +            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B).get()
 +            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
 +      
 +      detail::prod(A.lhs(), B, C, "prod16_TA", "prod_TA");
++=======
+     template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+     void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>,
+                                                       const matrix_base<NumericT, F1>,
+                                                       op_trans> & A,
+                    const matrix_base<NumericT, F2> & B,
+                          matrix_base<NumericT, F3> & C,
+                    ScalarType alpha,
+                    ScalarType beta)
+     {
+       assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C) && bool("Size check failed at C = prod(trans(A), B): size2(A) != size1(C)"));
+       assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B) && bool("Size check failed at C = prod(trans(A), B): size1(A) != size1(B)"));
+       assert(viennacl::traits::size2(B)       == viennacl::traits::size2(C) && bool("Size check failed at C = prod(trans(A), B): size2(B) != size2(C)"));
+ 
+       switch (viennacl::traits::handle(A.lhs()).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::prod_impl(A, B, C, alpha, beta);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::prod_impl(A, B, C, alpha, beta);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::prod_impl(A, B, C, alpha, beta);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
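
Dimension sketch matching the asserts for C = prod(trans(A), B); note that A stays in its stored layout, so its column count supplies the rows of C:

    viennacl::matrix<double> A(32, 64), B(32, 48), C(64, 48);
    C = viennacl::linalg::prod(viennacl::trans(A), B);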
  
  
@@@ -682,26 -511,37 +919,60 @@@
      * Implementation of C = prod(A, trans(B));
      *
      */
++<<<<<<< HEAD
 +    template <typename T1, typename T2, typename T3 >
 +    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
 +                                  && viennacl::is_matrix<T2>::value
 +                                  && viennacl::is_matrix<T3>::value
 +                                >::type
 +    prod_impl(const T1 & A, 
 +              const viennacl::matrix_expression< const T2,
 +                                                 const T2,
 +                                                 op_trans> & B,
 +              T3 & C)
 +    {
 +      assert(viennacl::traits::size1(A) == viennacl::traits::size1(C));
 +      assert(viennacl::traits::size2(A) == viennacl::traits::size2(B.lhs()));
 +      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C));
 +      // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
 +      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A).get() 
 +            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B.lhs()).get()
 +            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
 +      
 +      detail::prod(A, B.lhs(), C, "prod16_AT", "prod_AT");
++=======
+     template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+     void prod_impl(const matrix_base<NumericT, F1> & A,
+                    const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
+                          matrix_base<NumericT, F3> & C,
+                    ScalarType alpha,
+                    ScalarType beta)
+     {
+       assert(viennacl::traits::size1(A)       == viennacl::traits::size1(C)       && bool("Size check failed at C = prod(A, trans(B)): size1(A) != size1(C)"));
+       assert(viennacl::traits::size2(A)       == viennacl::traits::size2(B.lhs()) && bool("Size check failed at C = prod(A, trans(B)): size2(A) != size2(B)"));
+       assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)       && bool("Size check failed at C = prod(A, trans(B)): size1(B) != size2(C)"));
+ 
+       switch (viennacl::traits::handle(A).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::prod_impl(A, B, C, alpha, beta);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::prod_impl(A, B, C, alpha, beta);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::prod_impl(A, B, C, alpha, beta);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
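
The A * trans(B) case, again with sizes chosen to satisfy the asserts (size2(A) == size2(B), and C is size1(A) x size1(B)):

    viennacl::matrix<double> A(64, 32), B(48, 32), C(64, 48);
    C = viennacl::linalg::prod(A, viennacl::trans(B));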
  
  
@@@ -711,28 -551,37 +982,62 @@@
      * Implementation of C = prod(trans(A), trans(B));
      *
      */
++<<<<<<< HEAD
 +    template <typename T1, typename T2, typename T3 >
 +    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
 +                                  && viennacl::is_matrix<T2>::value
 +                                  && viennacl::is_matrix<T3>::value
 +                                >::type
 +    prod_impl(const viennacl::matrix_expression< const T1,
 +                                                 const T1,
 +                                                 op_trans> & A,
 +              const viennacl::matrix_expression< const T2,
 +                                                 const T2,
 +                                                 op_trans> & B,
 +              T3 & C)
 +    {
 +      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C));
 +      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()));
 +      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C));
 +      // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
 +      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A.lhs()).get() 
 +            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B.lhs()).get()
 +            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
 +      
 +      detail::prod(A.lhs(), B.lhs(), C, "prod16_TT", "prod_TT");
++=======
+     template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+     void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & A,
+                    const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
+                    matrix_base<NumericT, F3> & C,
+                    ScalarType alpha,
+                    ScalarType beta)
+     {
+       assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)       && bool("Size check failed at C = prod(trans(A), trans(B)): size2(A) != size1(C)"));
+       assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()) && bool("Size check failed at C = prod(trans(A), trans(B)): size1(A) != size2(B)"));
+       assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)       && bool("Size check failed at C = prod(trans(A), trans(B)): size1(B) != size2(C)"));
+ 
+       switch (viennacl::traits::handle(A.lhs()).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::prod_impl(A, B, C, alpha, beta);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::prod_impl(A, B, C, alpha, beta);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::prod_impl(A, B, C, alpha, beta);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
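
And the doubly transposed case for completeness; both operands keep their stored layout and only the proxies differ:

    viennacl::matrix<double> A(32, 64), B(48, 32), C(64, 48);
    C = viennacl::linalg::prod(viennacl::trans(A), viennacl::trans(B));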
  
  
@@@ -827,102 -788,6 +1244,105 @@@
    //
  
  
++<<<<<<< HEAD
 +
 +
 +
 +  //v = A * x
 +  /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
 +  *
 +  * @param proxy  An expression template proxy class.
 +  */
 +  template <typename SCALARTYPE, unsigned int ALIGNMENT>
 +  template <typename F, unsigned int MAT_ALIGNMENT>
 +  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
 +  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                                        const viennacl::vector<SCALARTYPE, ALIGNMENT>,
 +                                                                                        viennacl::op_prod> & proxy) 
 +  {
 +    // check for the special case x = A * x
 +    if (proxy.rhs().handle().get() == this->handle().get())
 +    {
 +      viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
 +      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
 +      *this = result;
 +    }
 +    else
 +    {
 +      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
 +    }
 +    return *this;
 +  }
 +
 +
 +  /** @brief Implementation of the operation v1 = A * v2, where A is a matrix and v1, v2 are vector ranges
 +  *
 +  * @param proxy  An expression template proxy class.
 +  */
 +  template <typename VectorType>
 +  template <typename MatrixType>
 +  typename viennacl::enable_if< viennacl::is_matrix<MatrixType>::value,
 +                                viennacl::vector_range<VectorType> & >::type
 +  viennacl::vector_range<VectorType>::operator=(const vector_expression< const MatrixType,
 +                                                                         const viennacl::vector_range<VectorType>,
 +                                                                         op_prod> & proxy)
 +  {
 +    typedef typename viennacl::result_of::cpu_value_type<VectorType>::type   cpu_value_type;
 +    
 +    
 +    // check for the special case x = A * x
 +    if (proxy.rhs().get().handle().get() == this->get().handle().get())
 +    {
 +      viennacl::vector<cpu_value_type> result(proxy.rhs().size());
 +      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
 +      *this = result;
 +    }
 +    else
 +    {
 +      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
 +    }
 +    return *this;
 +  }
 +
 +
 +  /** @brief Implementation of the operation v1 = A * v2, where A is a matrix and v1, v2 are vector slices
 +  *
 +  * @param proxy  An expression template proxy class.
 +  */
 +  template <typename VectorType>
 +  template <typename MatrixType>
 +  typename viennacl::enable_if< viennacl::is_matrix<MatrixType>::value,
 +                                viennacl::vector_slice<VectorType> & >::type
 +  viennacl::vector_slice<VectorType>::operator=(const vector_expression< const MatrixType,
 +                                                                         const viennacl::vector_slice<VectorType>,
 +                                                                         op_prod> & proxy)
 +  {
 +    typedef typename viennacl::result_of::cpu_value_type<VectorType>::type   cpu_value_type;
 +    
 +    
 +    // check for the special case x = A * x
 +    if (proxy.rhs().get().handle().get() == this->get().handle().get())
 +    {
 +      viennacl::vector<cpu_value_type> result(proxy.rhs().size());
 +      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
 +      *this = result;
 +    }
 +    else
 +    {
 +      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
 +    }
 +    return *this;
 +  }
 +
 +
 +
 +
 +
 +
 +
 +
++=======
++>>>>>>> upstream/1.5.1
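
Worth noting the division of labour here: the prod_impl() dispatchers assert against aliasing, while the operator= overloads on the HEAD side detect x = A * x by comparing memory handles and introduce a temporary themselves. A sketch of the caller-visible behaviour, assuming the types used above:

    viennacl::matrix<double> A(100, 100);
    viennacl::vector<double> x(100);
    x = viennacl::linalg::prod(A, x);         // fine: the handle check inserts a hidden temporary
    // viennacl::linalg::prod_impl(A, x, x);  // would trip the "Introduce a temporary!" assert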
    //v += A * x
    /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
    *
diff --cc viennacl/linalg/nmf.hpp
index 7136717,e47712d..d833728
--- a/viennacl/linalg/nmf.hpp
+++ b/viennacl/linalg/nmf.hpp
@@@ -2,24 -2,25 +2,40 @@@
  #define VIENNACL_LINALG_NMF_HPP
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
  ============================================================================= */
  
  /** @file viennacl/linalg/nmf.hpp
++<<<<<<< HEAD
 +    @brief Provides a nonnegative matrix factorization implementation.  Experimental in 1.3.x.
 +    
++=======
+     @brief Provides a nonnegative matrix factorization implementation.  Experimental.
+ 
++>>>>>>> upstream/1.5.1
      Contributed by Volodymyr Kysenko.
  */
  
@@@ -28,105 -29,172 +44,274 @@@
  #include "viennacl/matrix.hpp"
  #include "viennacl/linalg/prod.hpp"
  #include "viennacl/linalg/norm_2.hpp"
++<<<<<<< HEAD
 +#include "viennacl/linalg/kernels/nmf_kernels.h"
++=======
+ #include "viennacl/linalg/norm_frobenius.hpp"
+ #include "viennacl/linalg/opencl/kernels/nmf.hpp"
++>>>>>>> upstream/1.5.1
  
  namespace viennacl
  {
    namespace linalg
    {
++<<<<<<< HEAD
 +    //const std::string NMF_PROGRAM_NAME = "elem_wise_ops";
 +    const std::string NMF_MUL_DIV_KERNEL = "el_wise_mul_div";
 +    const std::string NMF_SUB_KERNEL = "sub_wise";
 +
 +
 +    template <typename ScalarType>
 +    void nmf(viennacl::matrix<ScalarType> const & v,
 +             viennacl::matrix<ScalarType> & w,
 +             viennacl::matrix<ScalarType> & h,
 +             std::size_t k,
 +             ScalarType eps = 0.000001,
 +             std::size_t max_iter = 10000,
 +             std::size_t check_diff_every_step = 100)
 +    {
 +      viennacl::linalg::kernels::nmf<ScalarType, 1>::init();
 +      
 +      w.resize(v.size1(), k);
 +      h.resize(k, v.size2());
 +
 +      std::vector<ScalarType> stl_w(w.internal_size1() * w.internal_size2());
 +      std::vector<ScalarType> stl_h(h.internal_size1() * h.internal_size2());
 +
 +      for (std::size_t j = 0; j < stl_w.size(); j++)
 +          stl_w[j] = static_cast<ScalarType>(rand()) / RAND_MAX;
 +
 +      for (std::size_t j = 0; j < stl_h.size(); j++)
 +          stl_h[j] = static_cast<ScalarType>(rand()) / RAND_MAX;
 +
 +      viennacl::matrix<ScalarType> wn(v.size1(), k);
 +      viennacl::matrix<ScalarType> wd(v.size1(), k);
 +      viennacl::matrix<ScalarType> wtmp(v.size1(), v.size2());
 +
 +      viennacl::matrix<ScalarType> hn(k, v.size2());
 +      viennacl::matrix<ScalarType> hd(k, v.size2());
 +      viennacl::matrix<ScalarType> htmp(k, k);
 +
 +      viennacl::matrix<ScalarType> appr(v.size1(), v.size2());
 +      viennacl::vector<ScalarType> diff(v.size1() * v.size2());
 +
 +      viennacl::fast_copy(&stl_w[0], &stl_w[0] + stl_w.size(), w);
 +      viennacl::fast_copy(&stl_h[0], &stl_h[0] + stl_h.size(), h);
 +
 +      ScalarType last_diff = 0.0f;
 +
 +
 +      
 +      for (std::size_t i = 0; i < max_iter; i++)
 +      {
 +        {
 +          hn = viennacl::linalg::prod(trans(w), v);
 +          htmp = viennacl::linalg::prod(trans(w), w);
 +          hd = viennacl::linalg::prod(htmp, h);
 +
 +          viennacl::ocl::kernel & mul_div_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(), 
 +                                                                             NMF_MUL_DIV_KERNEL);
 +          viennacl::ocl::enqueue(mul_div_kernel(h, hn, hd, cl_uint(stl_h.size())));
 +        }
 +        {
 +          wn = viennacl::linalg::prod(v, trans(h));
 +          wtmp = viennacl::linalg::prod(w, h);
 +          wd = viennacl::linalg::prod(wtmp, trans(h));
 +
 +          viennacl::ocl::kernel & mul_div_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(), 
 +                                                                             NMF_MUL_DIV_KERNEL);
 +          
 +          viennacl::ocl::enqueue(mul_div_kernel(w, wn, wd, cl_uint(stl_w.size())));
 +        }
 +
 +        if (i % check_diff_every_step == 0)
 +        {
 +          appr = viennacl::linalg::prod(w, h);
 +
 +         viennacl::ocl::kernel & sub_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(), 
 +                                                                        NMF_SUB_KERNEL);
 +          // Shortcut: store the difference of the two matrices in a vector so that norm_2 can be used
 +          viennacl::ocl::enqueue(sub_kernel(appr, v, diff, cl_uint(v.size1() * v.size2())));
 +          ScalarType diff_val = viennacl::linalg::norm_2(diff);
 +
 +          if((diff_val < eps) || (fabs(diff_val - last_diff) < eps))
 +          {
 +              //std::cout << "Stopped at diff - " << diff_val << "\n";
 +              break;
 +          }
 +
 +          last_diff = diff_val;
 +
 +          //printf("Iteration #%lu - %.5f \n", i, diff_val);
 +        }
 +      }
 +      
 +      
++=======
+     /** @brief Configuration class for the nonnegative-matrix-factorization algorithm. Specify tolerances, maximum iteration counts, etc., here. */
+     class nmf_config
+     {
+       public:
+         nmf_config(double val_epsilon = 1e-4,
+                    double val_epsilon_stagnation = 1e-5,
+                    vcl_size_t num_max_iters = 10000,
+                    vcl_size_t num_check_iters = 100)
+          : eps_(val_epsilon), stagnation_eps_(val_epsilon_stagnation),
+            max_iters_(num_max_iters),
+            check_after_steps_( (num_check_iters > 0) ? num_check_iters : 1),
+            print_relative_error_(false),
+            iters_(0) {}
+ 
+         /** @brief Returns the relative tolerance for convergence */
+         double tolerance() const { return eps_; }
+ 
+         /** @brief Sets the relative tolerance for convergence, i.e. norm(V - W * H) / norm(V - W_init * H_init) */
+         void tolerance(double e) { eps_ = e; }
+ 
+         /** @brief Relative tolerance for the stagnation check */
+         double stagnation_tolerance() const { return stagnation_eps_; }
+ 
+         /** @brief Sets the tolerance for the stagnation check (i.e. the minimum required relative change of the residual between two iterations) */
+         void stagnation_tolerance(double e) { stagnation_eps_ = e; }
+ 
+         /** @brief Returns the maximum number of iterations for the NMF algorithm */
+         vcl_size_t max_iterations() const { return max_iters_; }
+         /** @brief Sets the maximum number of iterations for the NMF algorithm */
+         void max_iterations(vcl_size_t m) { max_iters_ = m; }
+ 
+         /** @brief Returns the number of iterations of the last NMF run using this configuration object */
+         vcl_size_t iters() const { return iters_; }
+ 
+ 
+         /** @brief Number of steps after which the convergence of NMF should be checked (again) */
+         vcl_size_t check_after_steps() const { return check_after_steps_; }
+         /** @brief Set the number of steps after which the convergence of NMF should be checked (again) */
+         void check_after_steps(vcl_size_t c) { if (c > 0) check_after_steps_ = c; }
+ 
+         /** @brief Returns the flag specifying whether the relative tolerance should be printed in each iteration */
+         bool print_relative_error() const { return print_relative_error_; }
+         /** @brief Specify whether the relative error should be printed at each convergence check after 'num_check_iters' steps */
+         void print_relative_error(bool b) { print_relative_error_ = b; }
+ 
+         template <typename ScalarType>
+         friend void nmf(viennacl::matrix<ScalarType> const & V,
+                         viennacl::matrix<ScalarType> & W,
+                         viennacl::matrix<ScalarType> & H,
+                         nmf_config const & conf);
+ 
+       private:
+         double eps_;
+         double stagnation_eps_;
+         vcl_size_t max_iters_;
+         vcl_size_t check_after_steps_;
+         bool print_relative_error_;
+         mutable vcl_size_t iters_;
+     };
+ 
+ 
+     /** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
+      *
+      * @param V     Input matrix
+      * @param W     First factor
+      * @param H     Second factor
+      * @param conf  A configuration object holding tolerances and the like
+      */
+     template <typename ScalarType>
+     void nmf(viennacl::matrix<ScalarType> const & V,
+              viennacl::matrix<ScalarType> & W,
+              viennacl::matrix<ScalarType> & H,
+              nmf_config const & conf)
+     {
+       viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(V).context());
+ 
+       const std::string NMF_MUL_DIV_KERNEL = "el_wise_mul_div";
+ 
+       viennacl::linalg::opencl::kernels::nmf<ScalarType>::init(ctx);
+ 
+       assert(V.size1() == W.size1() && V.size2() == H.size2() && bool("Dimensions of W and H don't allow for V = W * H"));
+       assert(W.size2() == H.size1() && bool("Dimensions of W and H don't match, prod(W, H) impossible"));
+ 
+       vcl_size_t k = W.size2();
+       conf.iters_ = 0;
+ 
+       viennacl::matrix<ScalarType> wn(V.size1(), k);
+       viennacl::matrix<ScalarType> wd(V.size1(), k);
+       viennacl::matrix<ScalarType> wtmp(V.size1(), V.size2());
+ 
+       viennacl::matrix<ScalarType> hn(k, V.size2());
+       viennacl::matrix<ScalarType> hd(k, V.size2());
+       viennacl::matrix<ScalarType> htmp(k, k);
+ 
+       viennacl::matrix<ScalarType> appr(V.size1(), V.size2());
+       viennacl::vector<ScalarType> diff(V.size1() * V.size2());
+ 
+       ScalarType last_diff = 0;
+       ScalarType diff_init = 0;
+       bool stagnation_flag = false;
+ 
+ 
+       for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
+       {
+         conf.iters_ = i + 1;
+         {
+           hn   = viennacl::linalg::prod(trans(W), V);
+           htmp = viennacl::linalg::prod(trans(W), W);
+           hd   = viennacl::linalg::prod(htmp, H);
+ 
+           viennacl::ocl::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<ScalarType>::program_name(), NMF_MUL_DIV_KERNEL);
+           viennacl::ocl::enqueue(mul_div_kernel(H, hn, hd, cl_uint(H.internal_size1() * H.internal_size2())));
+         }
+         {
+           wn   = viennacl::linalg::prod(V, trans(H));
+           wtmp = viennacl::linalg::prod(W, H);
+           wd   = viennacl::linalg::prod(wtmp, trans(H));
+ 
+           viennacl::ocl::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<ScalarType>::program_name(), NMF_MUL_DIV_KERNEL);
+ 
+           viennacl::ocl::enqueue(mul_div_kernel(W, wn, wd, cl_uint(W.internal_size1() * W.internal_size2())));
+         }
+ 
+         if (i % conf.check_after_steps() == 0)  //check for convergence
+         {
+           appr = viennacl::linalg::prod(W, H);
+ 
+           appr -= V;
+           ScalarType diff_val = viennacl::linalg::norm_frobenius(appr);
+ 
+           if (i == 0)
+             diff_init = diff_val;
+ 
+           if (conf.print_relative_error())
+             std::cout << diff_val / diff_init << std::endl;
+ 
+           // Approximation check
+           if (diff_val / diff_init < conf.tolerance())
+             break;
+ 
+           // Stagnation check
+           if (std::fabs(diff_val - last_diff) / (diff_val * conf.check_after_steps()) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
+           {
+             if (stagnation_flag)       // iteration stagnates (two iterates with no notable progress)
+               break;
+             else                       // record stagnation in this iteration
+               stagnation_flag = true;
+           }
+           else                         // good progress in this iteration, so unset stagnation flag
+             stagnation_flag = false;
+ 
+           // prepare for next iterate:
+           last_diff = diff_val;
+         }
+       }
+ 
+ 
++>>>>>>> upstream/1.5.1
      }
    }
  }
  
- #endif
++<<<<<<< HEAD
++#endif
++=======
+ #endif
++>>>>>>> upstream/1.5.1
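
A usage sketch for the 1.5.1 interface above (the OpenCL backend is required, since nmf() fetches its kernels from the matrix's OpenCL context). The sizes, the rank k, and the nonnegative seeding of W and H are the caller's responsibility here, as this version no longer resizes or randomizes the factors:

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/nmf.hpp"

    viennacl::matrix<float> V(100, 80);             // nonnegative input, filled by the caller
    vcl_size_t k = 8;                               // rank of the factorization
    viennacl::matrix<float> W(V.size1(), k), H(k, V.size2());
    // ... seed W and H with nonnegative values ...

    viennacl::linalg::nmf_config conf(1e-4, 1e-5, 5000, 50);
    viennacl::linalg::nmf(V, W, H, conf);           // V is approximated by W * H
    vcl_size_t iters_used = conf.iters();           // iterations actually performed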
diff --cc viennacl/linalg/norm_1.hpp
index b6d0cdc,42c6e02..6487a33
--- a/viennacl/linalg/norm_1.hpp
+++ b/viennacl/linalg/norm_1.hpp
@@@ -86,27 -82,25 +82,49 @@@ namespace viennac
                                            viennacl::op_norm_1 >(vector, vector);
      }
  
++<<<<<<< HEAD
 +    template< typename VectorType >
 +    viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
 +                                 const viennacl::vector_range<VectorType>,
 +                                 viennacl::op_norm_1 >
 +    norm_1(viennacl::vector_range<VectorType> const & vector)
 +    {
 +      return viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
 +                                          const viennacl::vector_range<VectorType>,
 +                                          viennacl::op_norm_1 >(vector, vector);
 +    }
 +
 +    template< typename VectorType >
 +    viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
 +                                 const viennacl::vector_slice<VectorType>,
 +                                 viennacl::op_norm_1 >
 +    norm_1(viennacl::vector_slice<VectorType> const & vector)
 +    {
 +      return viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
 +                                          const viennacl::vector_slice<VectorType>,
 +                                          viennacl::op_norm_1 >(vector, vector);
 +    }
++=======
+     // with vector expression:
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                 const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                 viennacl::op_norm_1>
+     norm_1(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+     {
+       return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                           const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                           viennacl::op_norm_1 >(vector, vector);
+     }
+ 
+     // with matrix
+     /*template<typename NumericT, typename F>
+     scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_1>
+     norm_1(const matrix<NumericT, F> & A)
+     {
+       return scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_1>(A, A);
+     }*/
++>>>>>>> upstream/1.5.1
  
    } // end namespace linalg
  } // end namespace viennacl
diff --cc viennacl/linalg/norm_2.hpp
index e10bb9a,e716ce3..20d21c3
--- a/viennacl/linalg/norm_2.hpp
+++ b/viennacl/linalg/norm_2.hpp
@@@ -153,28 -107,18 +107,43 @@@ namespace viennac
                                            viennacl::op_norm_2 >(v, v);
      }
  
++<<<<<<< HEAD
 +
 +    template< typename VectorType >
 +    viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
 +                                 const viennacl::vector_range<VectorType>,
 +                                 viennacl::op_norm_2 >
 +    norm_2(viennacl::vector_range<VectorType> const & vector)
 +    {
 +      return viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
 +                                          const viennacl::vector_range<VectorType>,
 +                                          viennacl::op_norm_2 >(vector, vector);
 +    }
 +
 +    template< typename VectorType >
 +    viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
 +                                 const viennacl::vector_slice<VectorType>,
 +                                 viennacl::op_norm_2 >
 +    norm_2(viennacl::vector_slice<VectorType> const & vector)
 +    {
 +      return viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
 +                                          const viennacl::vector_slice<VectorType>,
 +                                          viennacl::op_norm_2 >(vector, vector);
 +    }
++=======
+     // with vector expression:
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                 const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                 viennacl::op_norm_2>
+     norm_2(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+     {
+       return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                           const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                           viennacl::op_norm_2>(vector, vector);
+     }
+ 
++>>>>>>> upstream/1.5.1
  
    } // end namespace linalg
  } // end namespace viennacl
diff --cc viennacl/linalg/norm_inf.hpp
index 3dc1b48,b8d15eb..ab5681c
--- a/viennacl/linalg/norm_inf.hpp
+++ b/viennacl/linalg/norm_inf.hpp
@@@ -90,27 -85,27 +85,51 @@@ namespace viennac
                                            viennacl::op_norm_inf >(v1, v1);
      }
  
++<<<<<<< HEAD
 +    template< typename VectorType >
 +    viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
 +                                 const viennacl::vector_range<VectorType>,
 +                                 viennacl::op_norm_inf >
 +    norm_inf(viennacl::vector_range<VectorType> const & vector)
 +    {
 +      return viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
 +                                          const viennacl::vector_range<VectorType>,
 +                                          viennacl::op_norm_inf >(vector, vector);
 +    }
 +
 +    template< typename VectorType >
 +    viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
 +                                 const viennacl::vector_slice<VectorType>,
 +                                 viennacl::op_norm_inf >
 +    norm_inf(viennacl::vector_slice<VectorType> const & vector)
 +    {
 +      return viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
 +                                          const viennacl::vector_slice<VectorType>,
 +                                          viennacl::op_norm_inf >(vector, vector);
 +    }
++=======
+     // with vector expression:
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                 const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                 viennacl::op_norm_inf>
+     norm_inf(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+     {
+       return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                           const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                           viennacl::op_norm_inf >(vector, vector);
+     }
+ 
+     // with matrix:
+     /*
+     template<typename NumericT, typename F>
+     scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_inf>
+     norm_inf(const matrix<NumericT, F> & A)
+     {
+       return scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_inf>(A, A);
+     }*/
+ 
++>>>>>>> upstream/1.5.1
  
    } // end namespace linalg
  } // end namespace viennacl
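
The norm_2 and norm_inf conflicts above follow the same pattern: the HEAD
branch adds overloads for vector_range and vector_slice arguments, while
upstream 1.5.1 adds one for general vector expressions. A hedged sketch of
the range-based variant (the vector v and the range bounds are made up;
vector_range is declared in viennacl/vector_proxy.hpp):

    #include <viennacl/vector.hpp>
    #include <viennacl/vector_proxy.hpp>
    #include <viennacl/linalg/norm_inf.hpp>

    int main()
    {
      viennacl::vector<float> v(100);     // hypothetical vector
      viennacl::range r(0, 50);           // first half of v
      viennacl::vector_range< viennacl::vector<float> > v_sub(v, r);
      float nrm = viennacl::linalg::norm_inf(v_sub);  // sup-norm of the sub-vector
      (void)nrm;
      return 0;
    }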
diff --cc viennacl/linalg/power_iter.hpp
index c9674fa,75ee20d..bfe6026
--- a/viennacl/linalg/power_iter.hpp
+++ b/viennacl/linalg/power_iter.hpp
@@@ -2,24 -2,25 +2,41 @@@
  #define VIENNACL_LINALG_POWER_ITER_HPP_
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2011, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
  ============================================================================= */
  
++<<<<<<< HEAD
 +/** @file power_iter.hpp
 +    @brief Defines a tag for the configuration of the power iteration method.
 +    
++=======
+ /** @file viennacl/linalg/power_iter.hpp
+     @brief Defines a tag for the configuration of the power iteration method.
+ 
++>>>>>>> upstream/1.5.1
      Contributed by Astrid Rupp.
  */
  
@@@ -31,40 -32,40 +48,72 @@@
  
  namespace viennacl
  {
++<<<<<<< HEAD
 +  namespace linalg 
 +  {
 +    /** @brief A tag for the power iteration algorithm. */
 +    class power_iter_tag 
 +    {
 +      public:
 +        
++=======
+   namespace linalg
+   {
+     /** @brief A tag for the power iteration algorithm. */
+     class power_iter_tag
+     {
+       public:
+ 
++>>>>>>> upstream/1.5.1
          /** @brief The constructor
          *
          * @param tfac      If the relative change of the eigenvalue estimate is smaller than this termination factor, the algorithm stops
          * @param max_iters Maximum number of iterations for the power iteration
          */
++<<<<<<< HEAD
 +        power_iter_tag(double tfac = 1e-8, std::size_t max_iters = 50000) : termination_factor_(tfac), max_iterations_(max_iters) {}
++=======
+         power_iter_tag(double tfac = 1e-8, vcl_size_t max_iters = 50000) : termination_factor_(tfac), max_iterations_(max_iters) {}
++>>>>>>> upstream/1.5.1
  
          /** @brief Sets the factor for termination */
          void factor(double fct){ termination_factor_ = fct; }
  
            /** @brief Returns the factor for termination */
          double factor() const { return termination_factor_; }
++<<<<<<< HEAD
 +        
 +        std::size_t max_iterations() const { return max_iterations_; }
 +        void max_iterations(std::size_t new_max) { max_iterations_ = new_max; }
 +
 +      private: 
 +        double termination_factor_;
 +        std::size_t max_iterations_;
 +
 +    };
 +  
 +   /** 
 +    *   @brief Implementation of the calculation of eigenvalues using power iteration
 +    *   
 +    *   @param matrix        The system matrix
 +    *   @param tag           Tag with termination factor 
++=======
+ 
+         vcl_size_t max_iterations() const { return max_iterations_; }
+         void max_iterations(vcl_size_t new_max) { max_iterations_ = new_max; }
+ 
+       private:
+         double termination_factor_;
+         vcl_size_t max_iterations_;
+ 
+     };
+ 
+    /**
+     *   @brief Implementation of the calculation of eigenvalues using power iteration
+     *
+     *   @param matrix        The system matrix
+     *   @param tag           Tag with termination factor
++>>>>>>> upstream/1.5.1
      *   @return              Returns the largest eigenvalue computed by the power iteration method
      */
      template< typename MatrixT >
@@@ -75,31 -76,33 +124,57 @@@
        typedef typename viennacl::result_of::value_type<MatrixT>::type           ScalarType;
        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
        typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type    VectorT;
++<<<<<<< HEAD
 +    
 +      CPU_ScalarType eigenvalue;
 +      long matrix_size = matrix.size1();
 +      VectorT r(matrix_size);
 +      std::vector<CPU_ScalarType> s(matrix_size);
 +      
 +      for(std::size_t i=0; i<s.size(); ++i)
++=======
+ 
+       CPU_ScalarType eigenvalue;
+       vcl_size_t matrix_size = matrix.size1();
+       VectorT r(matrix_size);
+       VectorT r2(matrix_size);
+       std::vector<CPU_ScalarType> s(matrix_size);
+ 
+       for(vcl_size_t i=0; i<s.size(); ++i)
++>>>>>>> upstream/1.5.1
          s[i] = (i % 3) * CPU_ScalarType(0.1234) - CPU_ScalarType(0.5);   //'random' starting vector
  
        detail::copy_vec_to_vec(s,r);
  
        //std::cout << s << std::endl;
++<<<<<<< HEAD
 +      
++=======
+ 
++>>>>>>> upstream/1.5.1
        double epsilon = tag.factor();
        CPU_ScalarType norm = norm_2(r);
        CPU_ScalarType norm_prev = 0;
        long numiter = 0;
  
++<<<<<<< HEAD
 +      for (std::size_t i=0; i<tag.max_iterations(); ++i)
 +      {
 +        if (std::abs<CPU_ScalarType>(norm - norm_prev) / std::abs<CPU_ScalarType>(norm) < epsilon)
 +          break; 
 +           
 +        r /= norm;
 +        r = viennacl::linalg::prod(matrix, r);
++=======
+       for (vcl_size_t i=0; i<tag.max_iterations(); ++i)
+       {
+         if (std::fabs(norm - norm_prev) / std::fabs(norm) < epsilon)
+           break;
+ 
+         r /= norm;
+         r2 = viennacl::linalg::prod(matrix, r);  // helper vector r2 computes r <- A * r without repeatedly creating temporaries
+         r = r2;
++>>>>>>> upstream/1.5.1
          norm_prev = norm;
          norm = norm_2(r);
          numiter++;
@@@ -112,4 -115,4 +187,8 @@@
  
    } // end namespace linalg
  } // end namespace viennacl
- #endif
++<<<<<<< HEAD
++#endif
++=======
+ #endif
++>>>>>>> upstream/1.5.1
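
The power_iter_tag bundles the two stopping criteria visible in the loop
above: iteration ends once the relative change |norm - norm_prev| / |norm|
of the eigenvalue estimate drops below the termination factor, or after
max_iterations steps. A minimal usage sketch (the entry point eig() lives
further down in power_iter.hpp, outside the hunks shown here; the 10x10
matrix is hypothetical):

    #include <viennacl/matrix.hpp>
    #include <viennacl/linalg/power_iter.hpp>

    int main()
    {
      viennacl::matrix<float> A(10, 10);                // hypothetical system matrix
      // stop when the estimate changes by less than 1e-10 relative to its
      // magnitude, or after at most 2000 iterations:
      viennacl::linalg::power_iter_tag tag(1e-10, 2000);
      float lambda_max = viennacl::linalg::eig(A, tag); // largest eigenvalue in modulus
      (void)lambda_max;
      return 0;
    }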
diff --cc viennacl/linalg/prod.hpp
index dde1f3d,702fc8f..9281018
--- a/viennacl/linalg/prod.hpp
+++ b/viennacl/linalg/prod.hpp
@@@ -137,69 -133,59 +133,100 @@@ namespace viennac
      // ----------------------------------------------------
      // VIENNACL
      //
-     template< typename MatrixT1, typename MatrixT2 >
-     viennacl::matrix_expression< const MatrixT1, 
-                                  const viennacl::matrix_range<MatrixT2>,
-                                  viennacl::op_prod >
-     prod(MatrixT1 const& A,
-          viennacl::matrix_range<MatrixT2> const& B)
+ 
+     // standard product:
+     template< typename NumericT, typename F1, typename F2>
+     viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                  const viennacl::matrix_base<NumericT, F2>,
+                                  viennacl::op_mat_mat_prod >
+     prod(viennacl::matrix_base<NumericT, F1> const & A,
+          viennacl::matrix_base<NumericT, F2> const & B)
      {
        // std::cout << "viennacl .. " << std::endl;
-       return viennacl::matrix_expression< const MatrixT1, 
-                                           const viennacl::matrix_range<MatrixT2>,
-                                           viennacl::op_prod >(A, B);
+       return viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                           const viennacl::matrix_base<NumericT, F2>,
+                                           viennacl::op_mat_mat_prod >(A, B);
      }
  
++<<<<<<< HEAD
 +    template< typename MatrixT1, typename MatrixT2 >
 +    viennacl::matrix_expression< const MatrixT1, 
 +                                 const viennacl::matrix_slice<MatrixT2>,
 +                                 viennacl::op_prod >
 +    prod(MatrixT1 const& A,
 +         viennacl::matrix_slice<MatrixT2> const& B)
 +    {
 +      // std::cout << "viennacl .. " << std::endl;
 +      return viennacl::matrix_expression< const MatrixT1, 
 +                                          const viennacl::matrix_slice<MatrixT2>,
 +                                          viennacl::op_prod >(A, B);
 +    }
 +
 +
 +    template< typename MatrixT1, typename MatrixT2 >
 +    viennacl::matrix_expression< const MatrixT1, 
 +                                 const viennacl::matrix_expression<const viennacl::matrix_range<MatrixT2>,
 +                                                                   const viennacl::matrix_range<MatrixT2>,
++=======
+     // right factor is transposed:
+     template< typename NumericT, typename F1, typename F2>
+     viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                  const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F2>,
+                                                                    const viennacl::matrix_base<NumericT, F2>,
++>>>>>>> upstream/1.5.1
                                                                     op_trans>,
-                                  viennacl::op_prod >
-     prod(MatrixT1 const & A,
-          viennacl::matrix_expression<const viennacl::matrix_range<MatrixT2>,
-                                      const viennacl::matrix_range<MatrixT2>,
+                                  viennacl::op_mat_mat_prod >
+     prod(viennacl::matrix_base<NumericT, F1> const & A,
+          viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F2>,
+                                      const viennacl::matrix_base<NumericT, F2>,
                                       op_trans> const & B)
      {
        // std::cout << "viennacl .. " << std::endl;
-       return viennacl::matrix_expression< const MatrixT1, 
-                                           const viennacl::matrix_expression<const viennacl::matrix_range<MatrixT2>,
-                                                                             const viennacl::matrix_range<MatrixT2>,
+       return viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                           const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F2>,
+                                                                             const viennacl::matrix_base<NumericT, F2>,
                                                                              op_trans>,
-                                           viennacl::op_prod >(A, B);
+                                           viennacl::op_mat_mat_prod >(A, B);
      }
  
++<<<<<<< HEAD
 +    template< typename MatrixT1, typename MatrixT2 >
 +    viennacl::matrix_expression< const MatrixT1, 
 +                                 const viennacl::matrix_expression<const viennacl::matrix_slice<MatrixT2>,
 +                                                                   const viennacl::matrix_slice<MatrixT2>,
 +                                                                   op_trans>,
 +                                 viennacl::op_prod >
 +    prod(MatrixT1 const & A,
 +         viennacl::matrix_expression<const viennacl::matrix_slice<MatrixT2>,
 +                                     const viennacl::matrix_slice<MatrixT2>,
 +                                     op_trans> const & B)
 +    {
 +      // std::cout << "viennacl .. " << std::endl;
 +      return viennacl::matrix_expression< const MatrixT1, 
 +                                          const viennacl::matrix_expression<const viennacl::matrix_slice<MatrixT2>,
 +                                                                            const viennacl::matrix_slice<MatrixT2>,
 +                                                                            op_trans>,
 +                                          viennacl::op_prod >(A, B);
++=======
+     // left factor transposed:
+     template< typename NumericT, typename F1, typename F2>
+     viennacl::matrix_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F1>,
+                                                                    const viennacl::matrix_base<NumericT, F1>,
+                                                                    op_trans>,
+                                  const viennacl::matrix_base<NumericT, F2>,
+                                  viennacl::op_mat_mat_prod >
+     prod(viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F1>,
+                                      const viennacl::matrix_base<NumericT, F1>,
+                                      op_trans> const & A,
+          viennacl::matrix_base<NumericT, F2> const & B)
+     {
+       // std::cout << "viennacl .. " << std::endl;
+       return viennacl::matrix_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F1>,
+                                                                             const viennacl::matrix_base<NumericT, F1>,
+                                                                             op_trans>,
+                                           const viennacl::matrix_base<NumericT, F2>,
+                                           viennacl::op_mat_mat_prod >(A, B);
++>>>>>>> upstream/1.5.1
      }
  
  
@@@ -206,78 -217,102 +258,142 @@@
  
  
  
- 
-     template< typename MatrixT, typename NumericT, unsigned int ALIGNMENT >
-     viennacl::vector_expression< const MatrixT, 
-                                  const viennacl::vector<NumericT, ALIGNMENT>,
+     // matrix-vector product
+     template< typename NumericT, typename F>
+     viennacl::vector_expression< const viennacl::matrix_base<NumericT, F>,
+                                  const viennacl::vector_base<NumericT>,
                                   viennacl::op_prod >
++<<<<<<< HEAD
 +    prod(MatrixT const& matrix,
 +         viennacl::vector<NumericT, ALIGNMENT> const & vector)
++=======
+     prod(viennacl::matrix_base<NumericT, F> const & matrix,
+          viennacl::vector_base<NumericT> const & vector)
++>>>>>>> upstream/1.5.1
      {
        // std::cout << "viennacl .. " << std::endl;
-       return viennacl::linalg::prod_impl(matrix, vector);
+       return viennacl::vector_expression< const viennacl::matrix_base<NumericT, F>,
+                                           const viennacl::vector_base<NumericT>,
+                                           viennacl::op_prod >(matrix, vector);
      }
  
++<<<<<<< HEAD
 +    template< typename MatrixT, typename VectorType >
 +    viennacl::vector_expression< const MatrixT, 
 +                                 const viennacl::vector_range<VectorType>,
 +                                 viennacl::op_prod >
 +    prod(MatrixT const& matrix,
 +         viennacl::vector_range<VectorType> const & vector)
 +    {
 +      // std::cout << "viennacl .. " << std::endl;
 +      return viennacl::vector_expression< const MatrixT, 
 +                                          const viennacl::vector_range<VectorType>,
 +                                          viennacl::op_prod >(matrix, vector);
 +    }
 +
 +    template< typename MatrixT, typename VectorType >
 +    viennacl::vector_expression< const MatrixT, 
 +                                 const viennacl::vector_slice<VectorType>,
 +                                 viennacl::op_prod >
 +    prod(MatrixT const& matrix,
 +         viennacl::vector_slice<VectorType> const & vector)
 +    {
 +      // std::cout << "viennacl .. " << std::endl;
 +      return viennacl::vector_expression< const MatrixT, 
 +                                          const viennacl::vector_slice<VectorType>,
 +                                          viennacl::op_prod >(matrix, vector);
 +    }
 +
 +
 +
 +
 +    template< typename MatrixT, typename NumericT, typename F, unsigned int ALIGNMENT >
 +    viennacl::matrix_expression< const MatrixT, 
 +                                 const viennacl::matrix<NumericT, F, ALIGNMENT>,
++=======
+     // transposed matrix-vector product
+     template< typename NumericT, typename F>
+     viennacl::vector_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F>,
+                                                                    const viennacl::matrix_base<NumericT, F>,
+                                                                    op_trans>,
+                                  const viennacl::vector_base<NumericT>,
++>>>>>>> upstream/1.5.1
                                   viennacl::op_prod >
-     prod(MatrixT const& matrix_A,
-          viennacl::matrix<NumericT, F, ALIGNMENT> const& matrix_B)
+     prod(viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F>,
+                                      const viennacl::matrix_base<NumericT, F>,
+                                      op_trans> const & matrix,
+          viennacl::vector_base<NumericT> const & vector)
      {
        // std::cout << "viennacl .. " << std::endl;
-       return viennacl::matrix_expression< const MatrixT, 
-                                           const viennacl::matrix<NumericT, F, ALIGNMENT>,
-                                           viennacl::op_prod >(matrix_A, matrix_B);
+       return viennacl::vector_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F>,
+                                                                             const viennacl::matrix_base<NumericT, F>,
+                                                                             op_trans>,
+                                           const viennacl::vector_base<NumericT>,
+                                           viennacl::op_prod >(matrix, vector);
      }
  
-     template< typename MatrixT, typename NumericT, typename F, unsigned int ALIGNMENT >
-     viennacl::matrix_expression< const MatrixT, 
-                                  const viennacl::matrix_expression< const viennacl::matrix<NumericT, F, ALIGNMENT>, 
-                                                                     const viennacl::matrix<NumericT, F, ALIGNMENT>,
-                                                                     viennacl::op_trans >,
-                                  viennacl::op_prod >
-     prod(MatrixT const& matrix_A,
-          const viennacl::matrix_expression< const viennacl::matrix<NumericT, F, ALIGNMENT>, 
-                                             const viennacl::matrix<NumericT, F, ALIGNMENT>,
-                                             viennacl::op_trans > & matrix_B)
+ 
+     template<typename SparseMatrixType, class SCALARTYPE>
+     typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                   vector_expression<const SparseMatrixType,
+                                                     const vector_base<SCALARTYPE>,
+                                                     op_prod >
+                                  >::type
+     prod(const SparseMatrixType & mat,
+          const vector_base<SCALARTYPE> & vec)
      {
-       // std::cout << "viennacl .. " << std::endl;
-       return viennacl::matrix_expression< const MatrixT, 
-                                           const viennacl::matrix_expression< const viennacl::matrix<NumericT, F, ALIGNMENT>, 
-                                                                              const viennacl::matrix<NumericT, F, ALIGNMENT>,
-                                                                              viennacl::op_trans >,
-                                           viennacl::op_prod >(matrix_A, matrix_B);
-       //return viennacl::linalg::prod_impl(matrix_A, matrix_B);
+       return vector_expression<const SparseMatrixType,
+                                const vector_base<SCALARTYPE>,
+                                op_prod >(mat, vec);
+     }
+ 
+     template< typename SparseMatrixType, typename SCALARTYPE, typename F1>
+     typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                   viennacl::matrix_expression<const SparseMatrixType,
+                                                               const matrix_base < SCALARTYPE, F1 >,
+                                                               op_prod >
+                                  >::type
+     prod(const SparseMatrixType & sp_mat,
+          const viennacl::matrix_base<SCALARTYPE, F1> & d_mat)
+     {
+       return viennacl::matrix_expression<const SparseMatrixType,
+                                          const viennacl::matrix_base < SCALARTYPE, F1 >,
+                                          op_prod >(sp_mat, d_mat);
+     }
+ 
+     // right factor is transposed
+     template< typename SparseMatrixType, typename SCALARTYPE, typename F1 >
+     typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                   viennacl::matrix_expression< const SparseMatrixType,
+                                                                const viennacl::matrix_expression<const viennacl::matrix_base<SCALARTYPE, F1>,
+                                                                                                  const viennacl::matrix_base<SCALARTYPE, F1>,
+                                                                                                  op_trans>,
+                                                                viennacl::op_prod >
+                                   >::type
+     prod(const SparseMatrixType & A,
+          viennacl::matrix_expression<const viennacl::matrix_base < SCALARTYPE, F1 >,
+                                      const viennacl::matrix_base < SCALARTYPE, F1 >,
+                                      op_trans> const & B)
+     {
+       return viennacl::matrix_expression< const SparseMatrixType,
+                                           const viennacl::matrix_expression<const viennacl::matrix_base < SCALARTYPE, F1 >,
+                                                                             const viennacl::matrix_base < SCALARTYPE, F1 >,
+                                                                             op_trans>,
+                                           viennacl::op_prod >(A, B);
+     }
+ 
+     template<typename StructuredMatrixType, class SCALARTYPE>
+     typename viennacl::enable_if< viennacl::is_any_dense_structured_matrix<StructuredMatrixType>::value,
+                                   vector_expression<const StructuredMatrixType,
+                                                     const vector_base<SCALARTYPE>,
+                                                     op_prod >
+                                  >::type
+     prod(const StructuredMatrixType & mat,
+          const vector_base<SCALARTYPE> & vec)
+     {
+       return vector_expression<const StructuredMatrixType,
+                                const vector_base<SCALARTYPE>,
+                                op_prod >(mat, vec);
      }
  
    } // end namespace linalg
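
Upstream's prod() now dispatches on matrix_base and tags dense
matrix-matrix products with op_mat_mat_prod, with dedicated overloads for
a transposed left or right factor and for sparse and structured matrix
types. A short sketch of the calls this surface supports (all names and
sizes are illustrative):

    #include <viennacl/matrix.hpp>
    #include <viennacl/vector.hpp>
    #include <viennacl/linalg/prod.hpp>

    int main()
    {
      viennacl::matrix<float> A(64, 64), B(64, 64), C(64, 64);
      viennacl::vector<float> x(64), y(64);

      C = viennacl::linalg::prod(A, B);                   // standard product
      C = viennacl::linalg::prod(A, viennacl::trans(B));  // right factor transposed
      C = viennacl::linalg::prod(viennacl::trans(A), B);  // left factor transposed
      y = viennacl::linalg::prod(A, x);                   // matrix-vector product
      return 0;
    }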
diff --cc viennacl/linalg/qr.hpp
index 07286bb,34b63ca..7935eaa
--- a/viennacl/linalg/qr.hpp
+++ b/viennacl/linalg/qr.hpp
@@@ -320,41 -217,40 +217,77 @@@ namespace viennac
          project( A, range(j+1, A.size1()), range(j, j+1) ) = project(v, range(j+1, A.size1()), range(0, 1) );
        }
  
++<<<<<<< HEAD
 +      
 +
 +      /** @brief Implementation of inplace-QR factorization for a general Boost.uBLAS compatible matrix A 
 +      * 
++=======
+ 
+ 
+       /** @brief Implementation of inplace-QR factorization for a general Boost.uBLAS compatible matrix A
+       *
++>>>>>>> upstream/1.5.1
        * @param A            A dense matrix compatible with Boost.uBLAS
        * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
        */
        template<typename MatrixType>
++<<<<<<< HEAD
 +      std::vector<typename MatrixType::value_type> inplace_qr_ublas(MatrixType & A, std::size_t block_size = 32)
 +      {
 +        typedef typename MatrixType::value_type   ScalarType;
 +        typedef boost::numeric::ublas::matrix_range<MatrixType>  MatrixRange;
 +        
 +        using boost::numeric::ublas::range;
 +        using boost::numeric::ublas::project;
 +        
 +        std::vector<ScalarType> betas(A.size2());
 +        //boost::numeric::ublas::vector<ScalarType> v(A.size1());
 +        MatrixType v(A.size1(), 1);
 +        MatrixType matrix_1x1(1,1);
 +
 +        MatrixType Y(A.size1(), block_size); Y.clear(); Y.resize(A.size1(), block_size);
 +        MatrixType W(A.size1(), block_size); W.clear(); W.resize(A.size1(), block_size);
 +          
 +        //run over A in a block-wise manner:
 +        for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
 +        {
 +          std::size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
 +          
 +          //determine Householder vectors:
 +          for (std::size_t k = 0; k < effective_block_size; ++k)
 +          {
 +            betas[j+k] = detail::setup_householder_vector_ublas(A, v, matrix_1x1, j+k);
 +            
 +            for (std::size_t l = k; l < effective_block_size; ++l)
++=======
+       std::vector<typename MatrixType::value_type> inplace_qr_ublas(MatrixType & A, vcl_size_t block_size = 32)
+       {
+         typedef typename MatrixType::value_type   ScalarType;
+         typedef boost::numeric::ublas::matrix_range<MatrixType>  MatrixRange;
+ 
+         using boost::numeric::ublas::range;
+         using boost::numeric::ublas::project;
+ 
+         std::vector<ScalarType> betas(A.size2());
+         MatrixType v(A.size1(), 1);
+         MatrixType matrix_1x1(1,1);
+ 
+         MatrixType Y(A.size1(), block_size); Y.clear(); Y.resize(A.size1(), block_size);
+         MatrixType W(A.size1(), block_size); W.clear(); W.resize(A.size1(), block_size);
+ 
+         //run over A in a block-wise manner:
+         for (vcl_size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
+         {
+           vcl_size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
+ 
+           //determine Householder vectors:
+           for (vcl_size_t k = 0; k < effective_block_size; ++k)
+           {
+             betas[j+k] = detail::setup_householder_vector_ublas(A, v, matrix_1x1, j+k);
+ 
+             for (vcl_size_t l = k; l < effective_block_size; ++l)
++>>>>>>> upstream/1.5.1
                detail::householder_reflect_ublas(A, v, matrix_1x1, betas[j+k], j+k, j+l);
  
              detail::write_householder_to_A_ublas(A, v, j+k);
@@@ -364,153 -260,147 +297,290 @@@
            // Setup Y:
            //
            Y.clear();  Y.resize(A.size1(), block_size);
++<<<<<<< HEAD
 +          for (std::size_t k = 0; k < effective_block_size; ++k)
 +          {
 +            //write Householder to Y:
 +            Y(j+k,k) = 1.0;
 +            project(Y, range(j+k+1, A.size1()), range(k, k+1)) = project(A, range(j+k+1, A.size1()), range(j+k, j+k+1));
 +          }
 +          
 +          //
 +          // Setup W:
 +          //
 +          
 +          //first vector:
 +          W.clear();  W.resize(A.size1(), block_size);
 +          W(j, 0) = -betas[j];
 +          project(W, range(j+1, A.size1()), range(0, 1)) = -betas[j] * project(A, range(j+1, A.size1()), range(j, j+1));
 +          
 +          
 +          //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
 +          for (std::size_t k = 1; k < effective_block_size; ++k)
 +          {
 +            MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
 +            MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
 +            MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
 +            MatrixRange z     = project(W, range(j, A.size1()), range(k, k+1));
 +            
 +            MatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
 +            z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
 +          }
 +
 +          //
 +          //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
 +          //
 +          
 +          if (A.size2() - j - effective_block_size > 0)
 +          {
 +            
 +            MatrixRange A_part(A, range(j, A.size1()), range(j+effective_block_size, A.size2()));
 +            MatrixRange W_part(W, range(j, A.size1()), range(0, effective_block_size));
 +            MatrixType temp = boost::numeric::ublas::prod(trans(W_part), A_part);
 +            
 +            A_part += prod(project(Y, range(j, A.size1()), range(0, Y.size2())),
 +                          temp);
 +          }
 +        }
 +
 +        return betas;
 +      }
 +
 +
 +      /** @brief Implementation of an OpenCL-only QR factorization for GPUs (or multi-core CPU). DEPRECATED! Use only if you're curious and interested in playing a bit with a GPU-only implementation.
 +      * 
 +      * Performance is rather poor at small matrix sizes.
 +      * Prefer the use of the hybrid version, which is automatically chosen using the interface function inplace_qr()
 +      * 
 +      * @param A            A dense ViennaCL matrix to be factored
 +      * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
 +      */
 +      template<typename MatrixType>
 +      std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type > 
 +      inplace_qr_viennacl(MatrixType & A, std::size_t block_size = 16)
 +      {
 +        typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
 +        typedef viennacl::matrix_range<MatrixType>  MatrixRange;
 +        
 +        //using boost::numeric::ublas::range;
 +        //using boost::numeric::ublas::project;
 +        using viennacl::range;
 +        using viennacl::project;
 +        
 +        std::vector<ScalarType> betas(A.size2());
 +        //boost::numeric::ublas::vector<ScalarType> v(A.size1());
 +        MatrixType v(A.size1(), 1);
 +        MatrixType matrix_1x1(1,1);
 +
 +        MatrixType Y(A.size1(), block_size); Y.clear();
 +        MatrixType W(A.size1(), block_size); W.clear();
 +
 +        MatrixType YT_prod_v(block_size, 1);
 +        MatrixType z(A.size1(), 1);      
 +        
 +        //run over A in a block-wise manner:
 +        for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
 +        {
 +          std::size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
 +          
 +          //determine Householder vectors:
 +          for (std::size_t k = 0; k < effective_block_size; ++k)
 +          {
 +            betas[j+k] = detail::setup_householder_vector_viennacl(A, v, matrix_1x1, j+k);
 +            for (std::size_t l = k; l < effective_block_size; ++l)
 +              detail::householder_reflect_viennacl(A, v, matrix_1x1, betas[j+k], j+k, j+l);
 +
 +            detail::write_householder_to_A_viennacl(A, v, j+k);
 +          }
++=======
+           for (vcl_size_t k = 0; k < effective_block_size; ++k)
+           {
+             //write Householder to Y:
+             Y(j+k,k) = 1.0;
+             project(Y, range(j+k+1, A.size1()), range(k, k+1)) = project(A, range(j+k+1, A.size1()), range(j+k, j+k+1));
+           }
+ 
+           //
+           // Setup W:
+           //
+ 
+           //first vector:
+           W.clear();  W.resize(A.size1(), block_size);
+           W(j, 0) = -betas[j];
+           project(W, range(j+1, A.size1()), range(0, 1)) = -betas[j] * project(A, range(j+1, A.size1()), range(j, j+1));
+ 
+ 
+           //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
+           for (vcl_size_t k = 1; k < effective_block_size; ++k)
+           {
+             MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
+             MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
+             MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
+             MatrixRange z     = project(W, range(j, A.size1()), range(k, k+1));
+ 
+             MatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
+             z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
+           }
+ 
+           //
+           //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
+           //
+ 
+           if (A.size2() - j - effective_block_size > 0)
+           {
+ 
+             MatrixRange A_part(A, range(j, A.size1()), range(j+effective_block_size, A.size2()));
+             MatrixRange W_part(W, range(j, A.size1()), range(0, effective_block_size));
+             MatrixType temp = boost::numeric::ublas::prod(trans(W_part), A_part);
+ 
+             A_part += prod(project(Y, range(j, A.size1()), range(0, effective_block_size)),
+                           temp);
+           }
+         }
+ 
+         return betas;
+       }
+ 
+ 
+       /** @brief Implementation of an OpenCL-only QR factorization for GPUs (or multi-core CPU). DEPRECATED! Use only if you're curious and interested in playing a bit with a GPU-only implementation.
+       *
+       * Performance is rather poor at small matrix sizes.
+       * Prefer the use of the hybrid version, which is automatically chosen using the interface function inplace_qr()
+       *
+       * @param A            A dense ViennaCL matrix to be factored
+       * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+       */
+       template<typename MatrixType>
+       std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type >
+       inplace_qr_viennacl(MatrixType & A, vcl_size_t block_size = 16)
+       {
+         typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
+         typedef viennacl::matrix_range<MatrixType>  MatrixRange;
+ 
+         using viennacl::range;
+         using viennacl::project;
+ 
+         std::vector<ScalarType> betas(A.size2());
+         MatrixType v(A.size1(), 1);
+         MatrixType matrix_1x1(1,1);
+ 
+         MatrixType Y(A.size1(), block_size); Y.clear();
+         MatrixType W(A.size1(), block_size); W.clear();
+ 
+         MatrixType YT_prod_v(block_size, 1);
+         MatrixType z(A.size1(), 1);
+ 
+         //run over A in a block-wise manner:
+         for (vcl_size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
+         {
+           vcl_size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
+ 
+           //determine Householder vectors:
+           for (vcl_size_t k = 0; k < effective_block_size; ++k)
+           {
+             betas[j+k] = detail::setup_householder_vector_viennacl(A, v, matrix_1x1, j+k);
+             for (vcl_size_t l = k; l < effective_block_size; ++l)
+               detail::householder_reflect_viennacl(A, v, matrix_1x1, betas[j+k], j+k, j+l);
+ 
+             detail::write_householder_to_A_viennacl(A, v, j+k);
+           }
++>>>>>>> upstream/1.5.1
  
            //
            // Setup Y:
            //
            Y.clear();
++<<<<<<< HEAD
 +          for (std::size_t k = 0; k < effective_block_size; ++k)
++=======
+           for (vcl_size_t k = 0; k < effective_block_size; ++k)
++>>>>>>> upstream/1.5.1
            {
              //write Householder to Y:
              Y(j+k,k) = 1.0;
              project(Y, range(j+k+1, A.size1()), range(k, k+1)) = project(A, range(j+k+1, A.size1()), range(j+k, j+k+1));
            }
++<<<<<<< HEAD
 +          
 +          //
 +          // Setup W:
 +          //
 +          
++=======
+ 
+           //
+           // Setup W:
+           //
+ 
++>>>>>>> upstream/1.5.1
            //first vector:
            W.clear();
            W(j, 0) = -betas[j];
            //project(W, range(j+1, A.size1()), range(0, 1)) = -betas[j] * project(A, range(j+1, A.size1()), range(j, j+1));
            project(W, range(j+1, A.size1()), range(0, 1)) = project(A, range(j+1, A.size1()), range(j, j+1));
            project(W, range(j+1, A.size1()), range(0, 1)) *= -betas[j];
++<<<<<<< HEAD
 +          
 +          
 +          //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
 +          for (std::size_t k = 1; k < effective_block_size; ++k)
 +          {
 +            MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
 +            MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
 +            MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
 +            //MatrixRange z     = project(W, range(0, A.size1()), range(k, k+1));
 +          
 +            //std::cout << "should: " << k << std::endl;
 +            project(YT_prod_v, range(0, k), range(0,1)) = prod(trans(Y_old), v_k);
 +            project(z, range(j, A.size1()), range(0,1)) = prod(W_old, project(YT_prod_v, range(0, k), range(0,1)));
 +            //project(W, range(0, A.size1()), range(k, k+1)) = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
 +            project(W, range(j, A.size1()), range(k, k+1)) = project(z, range(j, A.size1()), range(0,1));
 +            project(W, range(j, A.size1()), range(k, k+1)) += v_k;
 +            project(W, range(j, A.size1()), range(k, k+1)) *= - betas[j+k];
 +          }
 +
 +          //
 +          //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
 +          //
 +          
 +          if (A.size2() - j - effective_block_size > 0)
 +          {
 +            
 +            MatrixRange A_part(A, range(j, A.size1()), range(j+effective_block_size, A.size2()));
 +            MatrixRange W_part(W, range(j, A.size1()), range(0, effective_block_size));
 +            MatrixType temp = prod(trans(W_part), A_part);
 +            
 +            A_part += prod(project(Y, range(j, A.size1()), range(0, Y.size2())),
++=======
+ 
+ 
+           //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
+           for (vcl_size_t k = 1; k < effective_block_size; ++k)
+           {
+             MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
+             MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
+             MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
+ 
+             project(YT_prod_v, range(0, k), range(0,1)) = prod(trans(Y_old), v_k);
+             project(z, range(j, A.size1()), range(0,1)) = prod(W_old, project(YT_prod_v, range(0, k), range(0,1)));
+             project(W, range(j, A.size1()), range(k, k+1)) = project(z, range(j, A.size1()), range(0,1));
+             project(W, range(j, A.size1()), range(k, k+1)) += v_k;
+             project(W, range(j, A.size1()), range(k, k+1)) *= - betas[j+k];
+           }
+ 
+           //
+           //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
+           //
+ 
+           if (A.size2() > j + effective_block_size)
+           {
+ 
+             MatrixRange A_part(A, range(j, A.size1()), range(j+effective_block_size, A.size2()));
+             MatrixRange W_part(W, range(j, A.size1()), range(0, effective_block_size));
+             MatrixType temp = prod(trans(W_part), A_part);
+ 
+             A_part += prod(project(Y, range(j, A.size1()), range(0, effective_block_size)),
++>>>>>>> upstream/1.5.1
                            temp);
            }
          }
@@@ -524,43 -414,40 +594,73 @@@
  
  
        //MatrixType is ViennaCL-matrix
++<<<<<<< HEAD
 +      /** @brief Implementation of a hybrid QR factorization using uBLAS on the CPU and ViennaCL for GPUs (or multi-core CPU) 
 +      * 
 +      * Prefer the use of the convenience interface inplace_qr()
 +      * 
++=======
+       /** @brief Implementation of a hybrid QR factorization using uBLAS on the CPU and ViennaCL for GPUs (or multi-core CPU)
+       *
+       * Prefer the use of the convenience interface inplace_qr()
+       *
++>>>>>>> upstream/1.5.1
        * @param A            A dense ViennaCL matrix to be factored
        * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
        */
        template<typename MatrixType>
++<<<<<<< HEAD
 +      std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type > 
 +      inplace_qr_hybrid(MatrixType & A, std::size_t block_size = 16)
++=======
+       std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type >
+       inplace_qr_hybrid(MatrixType & A, vcl_size_t block_size = 16)
++>>>>>>> upstream/1.5.1
        {
          typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
  
          typedef viennacl::matrix_range<MatrixType>                    VCLMatrixRange;
          typedef boost::numeric::ublas::matrix<ScalarType>             UblasMatrixType;
          typedef boost::numeric::ublas::matrix_range<UblasMatrixType>  UblasMatrixRange;
 -
++<<<<<<< HEAD
 +        
 +        //using boost::numeric::ublas::range;
 +        //using boost::numeric::ublas::project;
 +        
++=======
++
++>>>>>>> upstream/1.5.1
          std::vector<ScalarType> betas(A.size2());
          UblasMatrixType v(A.size1(), 1);
          UblasMatrixType matrix_1x1(1,1);
  
          UblasMatrixType ublasW(A.size1(), block_size); ublasW.clear(); ublasW.resize(A.size1(), block_size);
          UblasMatrixType ublasY(A.size1(), block_size); ublasY.clear(); ublasY.resize(A.size1(), block_size);
++<<<<<<< HEAD
 +        
 +        UblasMatrixType ublasA(A.size1(), A.size1());
 +        
 +        MatrixType vclW(ublasW.size1(), ublasW.size2());
 +        MatrixType vclY(ublasY.size1(), ublasY.size2());
 +        
 +          
 +        //run over A in a block-wise manner:
 +        for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
 +        {
 +          std::size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
++=======
+ 
+         UblasMatrixType ublasA(A.size1(), A.size1());
+ 
+         MatrixType vclW(ublasW.size1(), ublasW.size2());
+         MatrixType vclY(ublasY.size1(), ublasY.size2());
+ 
+ 
+         //run over A in a block-wise manner:
+         for (vcl_size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
+         {
+           vcl_size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
++>>>>>>> upstream/1.5.1
            UblasMatrixRange ublasA_part = boost::numeric::ublas::project(ublasA,
                                                                          boost::numeric::ublas::range(0, A.size1()),
                                                                          boost::numeric::ublas::range(j, j + effective_block_size));
@@@ -569,13 -456,13 +669,23 @@@
                                            viennacl::range(j, j+effective_block_size)),
                           ublasA_part
                          );
++<<<<<<< HEAD
 +          
 +          //determine Householder vectors:
 +          for (std::size_t k = 0; k < effective_block_size; ++k)
 +          {
 +            betas[j+k] = detail::setup_householder_vector_ublas(ublasA, v, matrix_1x1, j+k);
 +            
 +            for (std::size_t l = k; l < effective_block_size; ++l)
++=======
+ 
+           //determine Householder vectors:
+           for (vcl_size_t k = 0; k < effective_block_size; ++k)
+           {
+             betas[j+k] = detail::setup_householder_vector_ublas(ublasA, v, matrix_1x1, j+k);
+ 
+             for (vcl_size_t l = k; l < effective_block_size; ++l)
++>>>>>>> upstream/1.5.1
                detail::householder_reflect_ublas(ublasA, v, matrix_1x1, betas[j+k], j+k, j+l);
  
              detail::write_householder_to_A_ublas(ublasA, v, j+k);
@@@ -585,35 -472,35 +695,67 @@@
            // Setup Y:
            //
            ublasY.clear();  ublasY.resize(A.size1(), block_size);
++<<<<<<< HEAD
 +          for (std::size_t k = 0; k < effective_block_size; ++k)
 +          {
 +            //write Householder to Y:
 +            ublasY(j+k,k) = 1.0;
 +            boost::numeric::ublas::project(ublasY, 
 +                                           boost::numeric::ublas::range(j+k+1, A.size1()), 
 +                                           boost::numeric::ublas::range(k, k+1)) 
 +              = boost::numeric::ublas::project(ublasA, 
 +                                               boost::numeric::ublas::range(j+k+1, A.size1()),
 +                                               boost::numeric::ublas::range(j+k, j+k+1));
 +          }
 +          
 +          //
 +          // Setup W:
 +          //
 +          
 +          //first vector:
 +          ublasW.clear();  ublasW.resize(A.size1(), block_size);
 +          ublasW(j, 0) = -betas[j];
 +          boost::numeric::ublas::project(ublasW, 
 +                                        boost::numeric::ublas::range(j+1, A.size1()), 
 +                                        boost::numeric::ublas::range(0, 1)) 
 +            = -betas[j] * boost::numeric::ublas::project(ublasA, 
 +                                                          boost::numeric::ublas::range(j+1, A.size1()), 
 +                                                          boost::numeric::ublas::range(j, j+1));
 +          
 +          
 +          //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
 +          for (std::size_t k = 1; k < effective_block_size; ++k)
++=======
+           for (vcl_size_t k = 0; k < effective_block_size; ++k)
+           {
+             //write Householder to Y:
+             ublasY(j+k,k) = 1.0;
+             boost::numeric::ublas::project(ublasY,
+                                            boost::numeric::ublas::range(j+k+1, A.size1()),
+                                            boost::numeric::ublas::range(k, k+1))
+               = boost::numeric::ublas::project(ublasA,
+                                                boost::numeric::ublas::range(j+k+1, A.size1()),
+                                                boost::numeric::ublas::range(j+k, j+k+1));
+           }
+ 
+           //
+           // Setup W:
+           //
+ 
+           //first vector:
+           ublasW.clear();  ublasW.resize(A.size1(), block_size);
+           ublasW(j, 0) = -betas[j];
+           boost::numeric::ublas::project(ublasW,
+                                         boost::numeric::ublas::range(j+1, A.size1()),
+                                         boost::numeric::ublas::range(0, 1))
+             = -betas[j] * boost::numeric::ublas::project(ublasA,
+                                                           boost::numeric::ublas::range(j+1, A.size1()),
+                                                           boost::numeric::ublas::range(j, j+1));
+ 
+ 
+           //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
+           for (vcl_size_t k = 1; k < effective_block_size; ++k)
++>>>>>>> upstream/1.5.1
            {
              UblasMatrixRange Y_old = boost::numeric::ublas::project(ublasY,
                                                                      boost::numeric::ublas::range(j, A.size1()),
@@@ -621,46 -508,44 +763,85 @@@
              UblasMatrixRange v_k   = boost::numeric::ublas::project(ublasY,
                                                                      boost::numeric::ublas::range(j, A.size1()),
                                                                      boost::numeric::ublas::range(k, k+1));
++<<<<<<< HEAD
 +            UblasMatrixRange W_old = boost::numeric::ublas::project(ublasW, 
 +                                                                    boost::numeric::ublas::range(j, A.size1()), 
 +                                                                    boost::numeric::ublas::range(0, k));
 +            UblasMatrixRange z     = boost::numeric::ublas::project(ublasW, 
 +                                                                    boost::numeric::ublas::range(j, A.size1()), 
 +                                                                    boost::numeric::ublas::range(k, k+1));
 +            
 +            UblasMatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
 +            z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
 +          }
 +          
 +          
++=======
+             UblasMatrixRange W_old = boost::numeric::ublas::project(ublasW,
+                                                                     boost::numeric::ublas::range(j, A.size1()),
+                                                                     boost::numeric::ublas::range(0, k));
+             UblasMatrixRange z     = boost::numeric::ublas::project(ublasW,
+                                                                     boost::numeric::ublas::range(j, A.size1()),
+                                                                     boost::numeric::ublas::range(k, k+1));
+ 
+             UblasMatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
+             z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
+           }
+ 
+ 
++>>>>>>> upstream/1.5.1
  
            //
            //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
            //
++<<<<<<< HEAD
 +          
 +          VCLMatrixRange A_part = viennacl::project(A,
 +                                                    viennacl::range(0, A.size1()),
 +                                                    viennacl::range(j, j+effective_block_size));
 +          
++=======
+ 
+           VCLMatrixRange A_part = viennacl::project(A,
+                                                     viennacl::range(0, A.size1()),
+                                                     viennacl::range(j, j+effective_block_size));
+ 
++>>>>>>> upstream/1.5.1
            viennacl::copy(boost::numeric::ublas::project(ublasA,
                                                          boost::numeric::ublas::range(0, A.size1()),
                                                          boost::numeric::ublas::range(j, j+effective_block_size)),
                          A_part);
++<<<<<<< HEAD
 +          
 +          viennacl::copy(ublasW, vclW);
 +          viennacl::copy(ublasY, vclY);
 +          
 +          if (A.size2() - j - effective_block_size > 0)
 +          {
 +            
 +            VCLMatrixRange A_part(A, viennacl::range(j, A.size1()), viennacl::range(j+effective_block_size, A.size2()));
 +            VCLMatrixRange W_part(vclW, viennacl::range(j, A.size1()), viennacl::range(0, effective_block_size));
 +            MatrixType temp = viennacl::linalg::prod(trans(W_part), A_part);
 +            
 +            A_part += viennacl::linalg::prod(viennacl::project(vclY, 
 +                                             viennacl::range(j, A.size1()), 
 +                                             viennacl::range(0, vclY.size2())),
 +                          temp);
++=======
+ 
+           viennacl::copy(ublasW, vclW);
+           viennacl::copy(ublasY, vclY);
+ 
+           if (A.size2() > j + effective_block_size)
+           {
+ 
+             VCLMatrixRange A_part(A, viennacl::range(j, A.size1()), viennacl::range(j+effective_block_size, A.size2()));
+             VCLMatrixRange W_part(vclW, viennacl::range(j, A.size1()), viennacl::range(0, effective_block_size));
+             MatrixType temp = viennacl::linalg::prod(trans(W_part), A_part);
+ 
+             A_part += viennacl::linalg::prod(viennacl::project(vclY, viennacl::range(j, A.size1()), viennacl::range(0, effective_block_size)),
+                                              temp);
++>>>>>>> upstream/1.5.1
            }
          }
  
@@@ -670,10 -555,10 +851,11 @@@
  
  
      } //namespace detail
-         
+ 
+ 
  
  
 +
      //takes an inplace QR matrix A and generates Q and R explicitly
      template <typename MatrixType, typename VectorType>
      void recoverQ(MatrixType const & A, VectorType const & betas, MatrixType & Q, MatrixType & R)
@@@ -718,31 -597,66 +894,83 @@@
        }
      }
  
++<<<<<<< HEAD
 +    /** @brief Overload of inplace-QR factorization of a ViennaCL matrix A 
 +     * 
++=======
+ 
+     /** @brief Computes Q^T b, where Q is an implicit orthogonal matrix defined via its Householder reflectors stored in A.
+      *
+      *  @param A      A matrix holding the Householder reflectors in the lower triangular part. Typically obtained from calling inplace_qr() on the original matrix
+      *  @param betas  The scalars beta_i for each Householder reflector (I - beta_i v_i v_i^T)
+      *  @param b      The vector b to which the result Q^T b is directly written
+      */
+     template <typename MatrixType, typename VectorType1, typename VectorType2>
+     void inplace_qr_apply_trans_Q(MatrixType const & A, VectorType1 const & betas, VectorType2 & b)
+     {
+       typedef typename viennacl::result_of::cpu_value_type<typename MatrixType::value_type>::type   ScalarType;
+ 
+       //
+       // Apply Q^T = (I - beta_m v_m v_m^T) \times ... \times (I - beta_0 v_0 v_0^T) by applying all the Householder reflectors to b:
+       //
+       for (vcl_size_t col_index=0; col_index<std::min(A.size1(), A.size2()); ++col_index)
+       {
+         ScalarType v_in_b = b[col_index];
+         for (vcl_size_t i=col_index+1; i<A.size1(); ++i)
+           v_in_b += A(i, col_index) * b[i];
+ 
+         b[col_index] -= betas[col_index] * v_in_b;
+         for (vcl_size_t i=col_index+1; i<A.size1(); ++i)
+           b[i] -= betas[col_index] * A(i, col_index) * v_in_b;
+       }
+     }
+ 
+     template <typename T, typename F, unsigned int ALIGNMENT, typename VectorType1, unsigned int A2>
+     void inplace_qr_apply_trans_Q(viennacl::matrix<T, F, ALIGNMENT> const & A, VectorType1 const & betas, viennacl::vector<T, A2> & b)
+     {
+       boost::numeric::ublas::matrix<T> ublas_A(A.size1(), A.size2());
+       viennacl::copy(A, ublas_A);
+ 
+       std::vector<T> stl_b(b.size());
+       viennacl::copy(b, stl_b);
+ 
+       inplace_qr_apply_trans_Q(ublas_A, betas, stl_b);
+ 
+       viennacl::copy(stl_b, b);
+     }
+ 
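A usage sketch tying inplace_qr_apply_trans_Q() above to the inplace_qr()
overloads that follow: a minimal least-squares solve, assuming Boost.uBLAS
types; function and variable names are illustrative, not taken from the
library's own tests:

    #include <vector>
    #include <boost/numeric/ublas/matrix.hpp>
    #include "viennacl/linalg/qr.hpp"

    // Sketch: minimize ||A x - b|| via the compact QR interfaces in this file.
    void least_squares_sketch(boost::numeric::ublas::matrix<float> & A,  // m x n, m >= n
                              std::vector<float> & b)                    // length m
    {
      // A is overwritten: R in the upper triangle, Householder vectors below it
      std::vector<float> betas = viennacl::linalg::inplace_qr(A);

      // b is overwritten with Q^T b; back substitution on the top n x n block
      // of R (not shown) then yields the least-squares solution
      viennacl::linalg::inplace_qr_apply_trans_Q(A, betas, b);
    }
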
+     /** @brief Overload of inplace-QR factorization of a ViennaCL matrix A
+      *
++>>>>>>> upstream/1.5.1
       * @param A            A dense ViennaCL matrix to be factored
-      * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+      * @param block_size   The block size to be used.
       */
      template<typename T, typename F, unsigned int ALIGNMENT>
-     std::vector<T> inplace_qr(viennacl::matrix<T, F, ALIGNMENT> & A, std::size_t block_size = 16)
+     std::vector<T> inplace_qr(viennacl::matrix<T, F, ALIGNMENT> & A, vcl_size_t block_size = 16)
      {
++<<<<<<< HEAD
 +      if (A.size2() % block_size != 0)
 +        std::cerr << "ViennaCL: Warning in inplace_qr(): Number of columns is not a multiple of the block size" << std::endl;
 +      
++=======
++>>>>>>> upstream/1.5.1
        return detail::inplace_qr_hybrid(A, block_size);
      }
  
-     /** @brief Overload of inplace-QR factorization for a general Boost.uBLAS compatible matrix A 
-      * 
+     /** @brief Overload of inplace-QR factorization for a general Boost.uBLAS compatible matrix A
+      *
      * @param A            A dense matrix compatible with Boost.uBLAS
-      * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+      * @param block_size   The block size to be used.
       */
      template<typename MatrixType>
-     std::vector<typename MatrixType::value_type> inplace_qr(MatrixType & A, std::size_t block_size = 16)
+     std::vector<typename MatrixType::value_type> inplace_qr(MatrixType & A, vcl_size_t block_size = 16)
      {
++<<<<<<< HEAD
 +      if (A.size2() % block_size != 0)
 +        std::cerr << "ViennaCL: Warning in inplace_qr(): Number of columns is not a multiple of the block size" << std::endl;
 +      
++=======
++>>>>>>> upstream/1.5.1
        return detail::inplace_qr_ublas(A, block_size);
      }
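
Where the explicit factors are needed, recoverQ() from the section above
expands the compact storage; a hedged sketch with assumed names and
dimensions:

    #include <vector>
    #include <boost/numeric/ublas/matrix.hpp>
    #include "viennacl/linalg/qr.hpp"

    // Sketch: explicit Q and R from the in-place factorization.
    void explicit_qr_sketch(boost::numeric::ublas::matrix<float> & A)  // m x n input
    {
      boost::numeric::ublas::matrix<float> Q(A.size1(), A.size1());
      boost::numeric::ublas::matrix<float> R(A.size1(), A.size2());

      std::vector<float> betas = viennacl::linalg::inplace_qr(A);
      viennacl::linalg::recoverQ(A, betas, Q, R);   // original A == Q * R (up to rounding)
    }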
  
diff --cc viennacl/linalg/row_scaling.hpp
index ee3fd0c,8795fb8..6e93148
--- a/viennacl/linalg/row_scaling.hpp
+++ b/viennacl/linalg/row_scaling.hpp
@@@ -198,29 -184,12 +184,32 @@@ namespace viennac
          template <unsigned int ALIGNMENT>
          void apply(viennacl::vector<ScalarType, ALIGNMENT> & vec) const
          {
++<<<<<<< HEAD
 +          assert(viennacl::traits::size1(system_matrix) == viennacl::traits::size(vec));
 +          
 +          //run kernel:
 +          viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<ScalarType, ALIGNMENT>::program_name(),
 +                                                                "diag_precond");
 +
 +          viennacl::ocl::enqueue(
 +             k(viennacl::traits::handle(diag_M_inv),
 +                cl_uint(viennacl::traits::start(diag_M_inv)),
 +                cl_uint(viennacl::traits::stride(diag_M_inv)),
 +                cl_uint(viennacl::traits::size(diag_M_inv)),
 +               viennacl::traits::handle(vec),
 +                cl_uint(viennacl::traits::start(vec)),
 +                cl_uint(viennacl::traits::stride(vec)),
 +                cl_uint(viennacl::traits::size(vec)) )
 +                                );        
 +          
++=======
+           assert(viennacl::traits::size(diag_M) == viennacl::traits::size(vec) && bool("Size mismatch"));
+           vec = element_div(vec, diag_M);
++>>>>>>> upstream/1.5.1
          }
-         
+ 
        private:
-         MatrixType const & system_matrix;
-         row_scaling_tag const & tag_;
-         viennacl::vector<ScalarType> diag_M_inv;
+         viennacl::vector<ScalarType> diag_M;
      };
  
    }
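
For context, the row-scaling class above is meant to be passed as a
preconditioner to an iterative solver; a minimal sketch, assuming a
compressed_matrix system, the cg solver interface, and a row_scaling_tag
norm parameter of 2 (the names here are illustrative):

    #include "viennacl/vector.hpp"
    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/linalg/cg.hpp"
    #include "viennacl/linalg/row_scaling.hpp"

    // Sketch: row scaling (l^2 row norms) as a CG preconditioner.
    viennacl::vector<float> precond_solve_sketch(viennacl::compressed_matrix<float> const & A,
                                                 viennacl::vector<float> const & rhs)
    {
      viennacl::linalg::row_scaling< viennacl::compressed_matrix<float> >
          precond(A, viennacl::linalg::row_scaling_tag(2));

      return viennacl::linalg::solve(A, rhs, viennacl::linalg::cg_tag(), precond);
    }
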
diff --cc viennacl/linalg/svd.hpp
index db04702,3f07411..405b1ef
--- a/viennacl/linalg/svd.hpp
+++ b/viennacl/linalg/svd.hpp
@@@ -2,24 -2,25 +2,40 @@@
  #define VIENNACL_LINALG_SVD_HPP
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
  ============================================================================= */
  
  /** @file viennacl/linalg/svd.hpp
++<<<<<<< HEAD
 +    @brief Provides singular value decomposition using a block-based approach.  Experimental in 1.3.x.
 +    
++=======
+     @brief Provides singular value decomposition using a block-based approach.  Experimental.
+ 
++>>>>>>> upstream/1.5.1
      Contributed by Volodymyr Kysenko.
  */
  
@@@ -32,98 -33,33 +48,128 @@@
  #include <cmath>
  
  #include "viennacl/matrix.hpp"
++<<<<<<< HEAD
 +#include "viennacl/linalg/kernels/svd_kernels.h"
 +
 +namespace viennacl 
 +{
 +  namespace linalg 
 +  {
 +  
 +    //const std::string SVD_KERNELS_FOLDER = "../../non-release/svd-kernels/";
 +    //const std::string SVD_BIDIAG_PROGRAM = "bidiag.cl";
 +
 +    const std::string SVD_BIDIAG_PACK_KERNEL = "bidiag_pack";
 +    const std::string SVD_HOUSEHOLDER_COL_KERNEL = "house_col";
 +    const std::string SVD_HOUSEHOLDER_ROW_KERNEL = "house_row";
 +    const std::string SVD_COPY_COL_KERNEL = "copy_col";
 +    const std::string SVD_COPY_ROW_KERNEL = "copy_row";
 +    const std::string SVD_MATRIX_TRANSPOSE_KERNEL = "transpose_inplace";
 +    const std::string SVD_INVERSE_SIGNS_KERNEL = "inverse_signs";
 +    const std::string SVD_GIVENS_PREV_KERNEL = "givens_prev";
 +    
 +    namespace detail 
 +    {
 +      static const float EPS = 0.00001f;
 +      static const std::size_t ITER_MAX = 50;
 +
 +      // Computes sqrt(a^2 + b^2) while avoiding overflow in the squares
 +      inline float pythag(float a, float b)
 +      {
 +        float absa = std::abs(a);
 +        float absb = std::abs(b);
 +
 +        if (absa > absb) {
 +          return absa * std::sqrt(1.0f + std::pow(absb / absa, 2));
 +        } else if (absb > 0.0f) {
 +          return absb * std::sqrt(1.0f + std::pow(absa / absb, 2));
 +        } else {
 +          return 0.0f; // both arguments are zero; avoid 0/0 = NaN
 +        }
 +      }
 +
 +      inline float sign(float val) 
 +      {
 +          return val >= 0.0f ? 1.0f : -1.0f;
 +      }
 +
 +      inline float norm_lcl(std::vector<float>& x, unsigned int size) 
 +      {
 +        float x_norm = 0.0;
 +        for(std::size_t i = 0; i < size; i++) x_norm += std::pow(x[i], 2);
 +        x_norm = std::sqrt(x_norm);
 +        return x_norm;
 +      }
 +
 +      template <typename T>
 +      void normalize(std::vector<T>& x, unsigned int size) 
 +      {
 +        float x_norm = norm_lcl(x, size);
 +        for(std::size_t i = 0; i < size; i++) {
 +            x[i] /= x_norm;
 +        }
 +      }
 +
 +      template <typename T>
 +      void householder_vector(std::vector<T> & v, unsigned int start)
 +      {
 +        // Build the Householder vector in place: shift the pivot entry by
 +        // -sign(v[start]) * ||v|| and renormalize, so that the reflector
 +        // I - 2 v v^T maps the input to a multiple of the start-th unit vector
 +        float x_norm = norm_lcl(v, v.size());
 +        float alpha = -sign(v[start]) * x_norm;
 +        v[start] += alpha;
 +        normalize(v, v.size());
 +      }
 +
 +      template <typename MatrixType>
 +      void transpose(MatrixType& A)
 +      {
 +
 +        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_MATRIX_TRANSPOSE_KERNEL);
 +
 +        viennacl::ocl::enqueue(kernel(
 +                                      A,
 +                                      static_cast<cl_uint>(A.internal_size1()),
 +                                      static_cast<cl_uint>(A.internal_size2())
 +                              ));
 +      }
 +
 +      template<typename MatrixType, typename VectorType>
 +      void givens_prev(MatrixType& matrix,
 +                        VectorType& tmp1,
 +                        VectorType& tmp2,
 +                        int n,
 +                        int l,
 +                        int k
 +                      )
 +      {
 +        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_GIVENS_PREV_KERNEL);
 +
 +        kernel.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size1(matrix), 256));
++=======
+ #include "viennacl/linalg/opencl/kernels/svd.hpp"
+ #include "viennacl/linalg/qr-method-common.hpp"
+ 
+ namespace viennacl
+ {
+   namespace linalg
+   {
+ 
+     namespace detail
+     {
+ 
+       template<typename MatrixType, typename VectorType>
+       void givens_prev(MatrixType & matrix,
+                        VectorType & tmp1,
+                        VectorType & tmp2,
+                        int n,
+                        int l,
+                        int k
+                       )
+       {
+         typedef typename MatrixType::value_type                                   ScalarType;
+         typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+ 
+         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context());
+         viennacl::ocl::kernel & kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<CPU_ScalarType>::program_name(), SVD_GIVENS_PREV_KERNEL);
+ 
+         kernel.global_work_size(0, viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size1(matrix), 256));
++>>>>>>> upstream/1.5.1
          kernel.local_work_size(0, 256);
  
          viennacl::ocl::enqueue(kernel(
@@@ -141,10 -77,14 +187,21 @@@
        template<typename MatrixType, typename VectorType>
        void change_signs(MatrixType& matrix, VectorType& signs, int n)
        {
++<<<<<<< HEAD
 +        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_INVERSE_SIGNS_KERNEL);
 +
 +        kernel.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size1(matrix), 16));
 +        kernel.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size2(matrix), 16));
++=======
+         typedef typename MatrixType::value_type                                   ScalarType;
+         typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+ 
+         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context());
+         viennacl::ocl::kernel & kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<CPU_ScalarType>::program_name(), SVD_INVERSE_SIGNS_KERNEL);
+ 
+         kernel.global_work_size(0, viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size1(matrix), 16));
+         kernel.global_work_size(1, viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size2(matrix), 16));
++>>>>>>> upstream/1.5.1
  
          kernel.local_work_size(0, 16);
          kernel.local_work_size(1, 16);
@@@ -157,64 -97,75 +214,131 @@@
                                ));
        }
  
++<<<<<<< HEAD
 +      template<typename MatrixType>
 +      void svd_qr_shift(MatrixType& vcl_u,
 +                        MatrixType& vcl_v,
 +                        boost::numeric::ublas::vector<float> &q, 
 +                        boost::numeric::ublas::vector<float> &e)
 +      {
 +        int n = q.size();
 +        int m = vcl_u.size1();
++=======
+       template<typename MatrixType, typename CPU_VectorType>
+       void svd_qr_shift(MatrixType & vcl_u,
+                         MatrixType & vcl_v,
+                         CPU_VectorType & q,
+                         CPU_VectorType & e)
+       {
+         typedef typename MatrixType::value_type                                   ScalarType;
+         typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+ 
+         int n = static_cast<int>(q.size());
+         int m = static_cast<int>(vcl_u.size1());
++>>>>>>> upstream/1.5.1
  
          detail::transpose(vcl_u);
          detail::transpose(vcl_v);
  
++<<<<<<< HEAD
 +        std::vector<float> signs_v(n, 1.0f);
 +        std::vector<float> cs1(n), ss1(n), cs2(n), ss2(n);
 +        
 +        viennacl::vector<float> tmp1(n), tmp2(n);
 +
 +        bool goto_test_conv = false;
 +
 +        for (int k = n - 1; k >= 0; k--) {
 +          // std::cout << "K = " << k << std::endl;
 +
 +          std::size_t iter = 0;
 +          for (iter = 0; iter < detail::ITER_MAX; iter++) {
 +            // test for split
 +            int l;
 +            for (l = k; l >= 0; l--) {
 +              goto_test_conv = false;
 +              if (fabs(e[l]) <= detail::EPS) {
++=======
+         std::vector<CPU_ScalarType> signs_v(n, 1);
+         std::vector<CPU_ScalarType> cs1(n), ss1(n), cs2(n), ss2(n);
+ 
+         viennacl::vector<CPU_ScalarType> tmp1(n), tmp2(n);
+ 
+         bool goto_test_conv = false;
+ 
+         for (int k = n - 1; k >= 0; k--)
+         {
+           // std::cout << "K = " << k << std::endl;
+ 
+           vcl_size_t iter = 0;
+           for (iter = 0; iter < detail::ITER_MAX; iter++)
+           {
+             // test for split
+             int l;
+             for (l = k; l >= 0; l--)
+             {
+               goto_test_conv = false;
+               if (std::fabs(e[l]) <= detail::EPS)
+               {
++>>>>>>> upstream/1.5.1
                  // set it
                  goto_test_conv = true;
                  break;
                }
  
++<<<<<<< HEAD
 +              if (fabs(q[l - 1]) <= detail::EPS) {
++=======
+               if (std::fabs(q[l - 1]) <= detail::EPS)
+               {
++>>>>>>> upstream/1.5.1
                  // goto
                  break;
                }
              }
  
++<<<<<<< HEAD
 +            if (!goto_test_conv) {
 +              float c = 0.0;
 +              float s = 1.0;
 +
 +              //int l1 = l - 1;
 +              int l2 = k;
 +
 +              for (int i = l; i <= k; i++) {
 +                float f = s * e[i];
 +                e[i] = c * e[i];
 +
 +                if (fabs(f) <= detail::EPS) {
 +                  l2 = i - 1;
 +                  break;
 +                }
 +
 +                float g = q[i];
 +                float h = detail::pythag(f, g);
++=======
+             if (!goto_test_conv)
+             {
+               CPU_ScalarType c = 0.0;
+               CPU_ScalarType s = 1.0;
+ 
+               //int l1 = l - 1;
+               //int l2 = k;
+ 
+               for (int i = l; i <= k; i++)
+               {
+                 CPU_ScalarType f = s * e[i];
+                 e[i] = c * e[i];
+ 
+                 if (std::fabs(f) <= detail::EPS)
+                 {
+                   //l2 = i - 1;
+                   break;
+                 }
+ 
+                 CPU_ScalarType g = q[i];
+                 CPU_ScalarType h = detail::pythag(f, g);
++>>>>>>> upstream/1.5.1
                  q[i] = h;
                  c = g / h;
                  s = -f / h;
@@@ -225,41 -176,42 +349,75 @@@
  
                // std::cout << "Hitted!" << l1 << " " << l2 << "\n";
  
++<<<<<<< HEAD
 +              // for(int i = l; i <= l2; i++) 
 +              // {
 +              //   for (int j = 0; j < m; j++) 
 +              //   {
 +              //     float y = u(j, l1);
 +              //     float z = u(j, i);
++=======
+               // for(int i = l; i <= l2; i++)
+               // {
+               //   for (int j = 0; j < m; j++)
+               //   {
+               //     CPU_ScalarType y = u(j, l1);
+               //     CPU_ScalarType z = u(j, i);
++>>>>>>> upstream/1.5.1
                //     u(j, l1) = y * cs1[i] + z * ss1[i];
                //     u(j, i) = -y * ss1[i] + z * cs1[i];
                //   }
                // }
              }
  
++<<<<<<< HEAD
 +            float z = q[k];
 +
 +            if (l == k) {
 +              if (z < 0.0f) {
 +                q[k] = -z;
 +
 +                signs_v[k] *= -1.0f;
++=======
+             CPU_ScalarType z = q[k];
+ 
+             if (l == k)
+             {
+               if (z < 0)
+               {
+                 q[k] = -z;
+ 
+                 signs_v[k] *= -1;
++>>>>>>> upstream/1.5.1
                }
  
                break;
              }
  
++<<<<<<< HEAD
 +            if (iter >= detail::ITER_MAX - 1) {
 +              break;
 +            }
 +
 +            float x = q[l];
 +            float y = q[k - 1];
 +            float g = e[k - 1];
 +            float h = e[k];
 +            float f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0f * h * y);
 +            
 +            g = detail::pythag(f, 1.0);
++=======
+             if (iter >= detail::ITER_MAX - 1)
+               break;
+ 
+             CPU_ScalarType x = q[l];
+             CPU_ScalarType y = q[k - 1];
+             CPU_ScalarType g = e[k - 1];
+             CPU_ScalarType h = e[k];
+             CPU_ScalarType f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2 * h * y);
+ 
+             g = detail::pythag<CPU_ScalarType>(f, 1);
++>>>>>>> upstream/1.5.1
  
              if (f < 0) {
                f = ((x - z) * (x + z) + h * (y / (f - g) - h)) / x;
@@@ -267,16 -219,16 +425,27 @@@
                f = ((x - z) * (x + z) + h * (y / (f + g) - h)) / x;
              }
  
++<<<<<<< HEAD
 +            float c = 1.0;
 +            float s = 1.0;
 +
 +            for (std::size_t i = l + 1; i <= static_cast<std::size_t>(k); i++) 
++=======
+             CPU_ScalarType c = 1;
+             CPU_ScalarType s = 1;
+ 
+             for (vcl_size_t i = l + 1; i <= static_cast<vcl_size_t>(k); i++)
++>>>>>>> upstream/1.5.1
              {
                g = e[i];
                y = q[i];
                h = s * g;
                g = c * g;
++<<<<<<< HEAD
 +              float z = detail::pythag(f, h);
++=======
+               CPU_ScalarType z = detail::pythag(f, h);
++>>>>>>> upstream/1.5.1
                e[i - 1] = z;
                c = f / z;
                s = h / z;
@@@ -284,7 -236,7 +453,11 @@@
                g = -x * s + g * c;
                h = y * s;
                y = y * c;
++<<<<<<< HEAD
 +              
++=======
+ 
++>>>>>>> upstream/1.5.1
                cs1[i] = c;
                ss1[i] = s;
  
@@@ -298,7 -250,7 +471,11 @@@
                cs2[i] = c;
                ss2[i] = s;
              }
++<<<<<<< HEAD
 +            
++=======
+ 
++>>>>>>> upstream/1.5.1
              {
                viennacl::copy(cs1, tmp1);
                viennacl::copy(ss1, tmp2);
@@@ -312,7 -264,7 +489,11 @@@
  
                givens_prev(vcl_u, tmp1, tmp2, m, l, k);
              }
++<<<<<<< HEAD
 +            
++=======
+ 
++>>>>>>> upstream/1.5.1
              e[l] = 0.0;
              e[k] = f;
              q[k] = x;
@@@ -320,7 -272,7 +501,11 @@@
  
          }
  
++<<<<<<< HEAD
 +        
++=======
+ 
++>>>>>>> upstream/1.5.1
          viennacl::copy(signs_v, tmp1);
          change_signs(vcl_v, tmp1, n);
  
@@@ -329,66 -281,30 +514,91 @@@
          detail::transpose(vcl_v);
        }
  
++<<<<<<< HEAD
 +      template <typename SCALARTYPE, unsigned int ALIGNMENT>
 +      void eye(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A)
 +      {
 +      
 +        std::vector<SCALARTYPE> foo(A.size1() * A.size1(), 0);
 +        
 +        for(std::size_t i = 0; i < A.size1(); i++)
 +        {
 +          foo[i*A.size1() + i] = 1;
 +        }
 +
 +        viennacl::fast_copy(&foo[0], &foo[0] + foo.size(), A);
 +      }
 +      
 +      template <typename SCALARTYPE, unsigned int ALIGNMENT>
 +      void copy_vec(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
 +                    viennacl::vector<SCALARTYPE, ALIGNMENT>& V,
 +                    std::size_t row_start, 
 +                    std::size_t col_start, 
 +                    bool copy_col
 +      )
 +      {
 +
 +        std::string kernel_name = copy_col ? SVD_COPY_COL_KERNEL : SVD_COPY_ROW_KERNEL;
 +        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(),
 +                                                                  kernel_name);
 +
 +        viennacl::ocl::enqueue(kernel(
 +                                      A, 
 +                                      V, 
 +                                      static_cast<cl_uint>(row_start), 
 +                                      static_cast<cl_uint>(col_start),
 +                                      copy_col ? static_cast<cl_uint>(A.size1())
 +                                               : static_cast<cl_uint>(A.size2()),
 +                                      static_cast<cl_uint>(A.internal_size2())
 +                              ));
 +
 +      }
 +
 +      template <typename SCALARTYPE, unsigned int ALIGNMENT>
 +      bool householder_c(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
 +                          viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Q,
 +                          viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
 +                          std::size_t start) 
 +      {
 +
 +        std::size_t row_start = start, col_start = start;
 +
 +        if(row_start + 1 >= A.size1()) 
 +          return false;
 +
 +        std::vector<float> tmp(A.size1(), 0);
++=======
+ 
+       /*template <typename SCALARTYPE, unsigned int ALIGNMENT>
+       bool householder_c(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & A,
+                           viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & Q,
+                           viennacl::vector<SCALARTYPE, ALIGNMENT> & D,
+                           vcl_size_t start)
+       {
+ 
+         vcl_size_t row_start = start;
+         vcl_size_t col_start = start;
+ 
+         if(row_start + 1 >= A.size1())
+           return false;
+ 
+         std::vector<SCALARTYPE> tmp(A.size1(), 0);
++>>>>>>> upstream/1.5.1
  
          copy_vec(A, D, row_start, col_start, true);
          fast_copy(D.begin(), D.begin() + (A.size1() - row_start), tmp.begin() + row_start);
  
          detail::householder_vector(tmp, row_start);
++<<<<<<< HEAD
 +        fast_copy(tmp, D);
 +
 +        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_HOUSEHOLDER_COL_KERNEL);
++=======
+ 
+         fast_copy(tmp, D);
+ 
+         viennacl::ocl::kernel & kernel = viennacl::ocl::get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_COL_KERNEL);
++>>>>>>> upstream/1.5.1
  
          //kernel.global_work_size(0, A.size1() << 1);
  
@@@ -402,32 -318,81 +612,108 @@@
                                        static_cast<cl_uint>(A.size2()),
                                        static_cast<cl_uint>(A.internal_size2()),
                                        static_cast<cl_uint>(Q.internal_size2()),
++<<<<<<< HEAD
 +                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
 +                              ));
 +
 +        return true;
 +      }
 +
++=======
+                                       viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                               ));
+ 
+         return true;
+       }*/
+ 
+       template <typename SCALARTYPE, unsigned int ALIGNMENT>
+       bool householder_c(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
+                           viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Q,
+                           viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
+                           vcl_size_t row_start, vcl_size_t col_start)
+       {
+         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ 
+         if(row_start + 1 >= A.size1())
+           return false;
+ 
+         prepare_householder_vector(A, D, A.size1(), row_start, col_start, row_start, true);
+ 
+         {
+           viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
+ 
+           viennacl::ocl::enqueue(kernel(
+                                         A,
+                                         D,
+                                         static_cast<cl_uint>(row_start),
+                                         static_cast<cl_uint>(col_start),
+                                         static_cast<cl_uint>(A.size1()),
+                                         static_cast<cl_uint>(A.size2()),
+                                         static_cast<cl_uint>(A.internal_size2()),
+                                         viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                 ));
+         }
+ 
+         {
+           viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);
+ 
+           viennacl::ocl::enqueue(kernel(
+                                         Q,
+                                         D,
+                                         static_cast<cl_uint>(A.size1()),
+                                         static_cast<cl_uint>(A.size2()),
+                                         static_cast<cl_uint>(Q.internal_size2()),
+                                         viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                 ));
+         }
+ 
+         return true;
+       }
+ 
+       /*
++>>>>>>> upstream/1.5.1
        template <typename SCALARTYPE, unsigned int ALIGNMENT>
        bool householder_r(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
                            viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Q,
                            viennacl::vector<SCALARTYPE, ALIGNMENT>& S,
++<<<<<<< HEAD
 +                          std::size_t start)
 +      {
 +      
 +        std::size_t row_start = start, col_start = start + 1;
 +        if(col_start + 1 >= A.size2()) 
 +          return false;
 +
 +        std::vector<float> tmp(A.size2(), 0);
 +
 +        copy_vec(A, S, row_start, col_start, false);
 +        fast_copy(S.begin(), S.begin() + (A.size2() - col_start), tmp.begin() + col_start);
++=======
+                           vcl_size_t start)
+       {
+ 
+         vcl_size_t row_start = start;
+         vcl_size_t col_start = start + 1;
+ 
+         if(col_start + 1 >= A.size2())
+           return false;
+ 
+         std::vector<SCALARTYPE> tmp(A.size2(), 0);
+ 
+         copy_vec(A, S, row_start, col_start, false);
+         fast_copy(S.begin(),
+                   S.begin() + (A.size2() - col_start),
+                   tmp.begin() + col_start);
++>>>>>>> upstream/1.5.1
  
          detail::householder_vector(tmp, col_start);
          fast_copy(tmp, S);
  
++<<<<<<< HEAD
 +        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_HOUSEHOLDER_ROW_KERNEL);
++=======
+         viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_ROW_KERNEL);
++>>>>>>> upstream/1.5.1
  
          viennacl::ocl::enqueue(kernel(
                                        A,
@@@ -439,101 -404,126 +725,219 @@@
                                        static_cast<cl_uint>(A.size2()),
                                        static_cast<cl_uint>(A.internal_size2()),
                                        static_cast<cl_uint>(Q.internal_size2()),
++<<<<<<< HEAD
 +                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
 +                                ));
 +        return true;
 +      }
 +
 +      template <typename SCALARTYPE, unsigned int ALIGNMENT>
 +      void bidiag_pack(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
 +                        viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
 +                        viennacl::vector<SCALARTYPE, ALIGNMENT>& S
 +                      )
 +      {
 +        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_BIDIAG_PACK_KERNEL);
 +
 +        viennacl::ocl::enqueue(kernel(
 +                                      A, 
 +                                      D, 
 +                                      S,
 +                                      static_cast<cl_uint>(A.size1()), 
 +                                      static_cast<cl_uint>(A.size2()),
 +                                      static_cast<cl_uint>(A.internal_size2())
 +                                    ));
 +      }
 +
 +      template <typename SCALARTYPE, unsigned int ALIGNMENT>
 +      void bidiag(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Ai,
 +                  viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& QL,
 +                  viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& QR)
 +      {
 +        std::size_t row_num = Ai.size1();
 +        std::size_t col_num = Ai.size2();
 +
 +        std::size_t to = std::min(row_num, col_num);
 +        std::size_t big_to = std::max(row_num, col_num);
++=======
+                                       viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                 ));
+         return true;
+       } */
+ 
+       template <typename SCALARTYPE, unsigned int ALIGNMENT>
+       bool householder_r(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & A,
+                           viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & Q,
+                           viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
+                           vcl_size_t row_start, vcl_size_t col_start)
+       {
+         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ 
+         if(col_start + 1 >= A.size2())
+           return false;
+ 
+         prepare_householder_vector(A, D, A.size2(), row_start, col_start, col_start, false);
+ 
+         {
+           viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);
+ 
+           viennacl::ocl::enqueue(kernel(
+                                         A,
+                                         D,
+                                         static_cast<cl_uint>(row_start),
+                                         static_cast<cl_uint>(col_start),
+                                         static_cast<cl_uint>(A.size1()),
+                                         static_cast<cl_uint>(A.size2()),
+                                         static_cast<cl_uint>(A.internal_size2()),
+                                         viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                 ));
+         }
+ 
+         {
+           viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_QR_KERNEL);
+ 
+           viennacl::ocl::enqueue(kernel(
+                                         Q,
+                                         D,
+                                         static_cast<cl_uint>(A.size1()),
+                                         static_cast<cl_uint>(A.size2()),
+                                         static_cast<cl_uint>(Q.internal_size2()),
+                                         viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                 ));
+         }
+ 
+         return true;
+       }
+ 
+       template <typename SCALARTYPE, unsigned int ALIGNMENT>
+       void bidiag(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & Ai,
+                   viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & QL,
+                   viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & QR)
+       {
+         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(QL).context());
+ 
+         vcl_size_t row_num = Ai.size1();
+         vcl_size_t col_num = Ai.size2();
+ 
+         vcl_size_t to = std::min(row_num, col_num);
+         vcl_size_t big_to = std::max(row_num, col_num);
++>>>>>>> upstream/1.5.1
  
          //for storing householder vector
          viennacl::vector<SCALARTYPE, ALIGNMENT> hh_vector(big_to);
  
++<<<<<<< HEAD
 +        eye(QL);
 +        eye(QR);
 +
 +        for(std::size_t i = 0; i < to; i++) 
 +        {
 +          householder_c(Ai, QL, hh_vector, i);
 +          householder_r(Ai, QR, hh_vector, i);
++=======
+         QL = viennacl::identity_matrix<SCALARTYPE>(QL.size1(), ctx);
+         QR = viennacl::identity_matrix<SCALARTYPE>(QR.size1(), ctx);
+ 
+         for(vcl_size_t i = 0; i < to; i++)
+         {
+           householder_c(Ai, QL, hh_vector, i, i);
+           householder_r(Ai, QR, hh_vector, i, i+1);
++>>>>>>> upstream/1.5.1
          }
        }
  
      } // namespace detail
  
  
++<<<<<<< HEAD
 +    /** @brief Computes the singular value decomposition of a matrix A. Experimental in 1.3.x; works for single precision (float) only.
 +     * 
++=======
+     /** @brief Computes the singular value decomposition of a matrix A. Experimental in 1.3.x
+      *
++>>>>>>> upstream/1.5.1
       * @param A     The input matrix. Will be overwritten with a diagonal matrix containing the singular values on return
       * @param QL    The left orthogonal matrix
       * @param QR    The right orthogonal matrix
       */
++<<<<<<< HEAD
 +    template <unsigned int ALIGNMENT>
 +    void svd(viennacl::matrix<float, row_major, ALIGNMENT> & A,
 +              viennacl::matrix<float, row_major, ALIGNMENT> & QL,
 +              viennacl::matrix<float, row_major, ALIGNMENT> & QR) 
 +    {
 +      typedef float SCALARTYPE;
 +      
 +      viennacl::linalg::kernels::svd<SCALARTYPE, 1>::init();
 +
 +      std::size_t row_num = A.size1();
 +      std::size_t col_num = A.size2();
 +
 +      std::size_t to = std::min(row_num, col_num);
 +
 +
 +      viennacl::vector<SCALARTYPE, ALIGNMENT> d(to);
 +      viennacl::vector<SCALARTYPE, ALIGNMENT> s(to + 1);
 +      
 +      // first stage
 +      detail::bidiag(A, QL, QR);
 +      detail::bidiag_pack(A, d, s);
 +
 +      // second stage
 +      boost::numeric::ublas::vector<SCALARTYPE> dh(to, 0.0f);
 +      boost::numeric::ublas::vector<SCALARTYPE> sh(to + 1, 0.0f);
 +
 +      boost::numeric::ublas::matrix<float> h_U(row_num, row_num);
 +      boost::numeric::ublas::matrix<float> h_V(col_num, col_num);
 +
 +      fast_copy(d, dh);
 +      fast_copy(s, sh);
 +
 +      detail::svd_qr_shift( QL, QR, dh, sh);
 +
 +      boost::numeric::ublas::matrix<float> h_Sigma(row_num, col_num);
 +      h_Sigma.clear();
 +
 +      for (std::size_t i = 0; i < to; i++)
 +        h_Sigma(i, i) = dh(i);
++=======
+     template <typename SCALARTYPE, unsigned int ALIGNMENT>
+     void svd(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & A,
+               viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & QL,
+               viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & QR)
+     {
+       viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+       viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::init(ctx);
+ 
+       vcl_size_t row_num = A.size1();
+       vcl_size_t col_num = A.size2();
+ 
+       vcl_size_t to = std::min(row_num, col_num);
+ 
+ 
+       //viennacl::vector<SCALARTYPE, ALIGNMENT> d(to);
+       //viennacl::vector<SCALARTYPE, ALIGNMENT> s(to + 1);
+ 
+       // first stage
+       detail::bidiag(A, QL, QR);
+ 
+       // second stage
+       //std::vector<SCALARTYPE> dh(to, 0);
+       //std::vector<SCALARTYPE> sh(to + 1, 0);
+       boost::numeric::ublas::vector<SCALARTYPE> dh = boost::numeric::ublas::scalar_vector<SCALARTYPE>(to, 0);
+       boost::numeric::ublas::vector<SCALARTYPE> sh = boost::numeric::ublas::scalar_vector<SCALARTYPE>(to + 1, 0);
+ 
+       detail::bidiag_pack(A, dh, sh);
+ 
+       detail::svd_qr_shift( QL, QR, dh, sh);
+ 
+       // Write resulting diagonal matrix with singular values to A:
+       boost::numeric::ublas::matrix<SCALARTYPE> h_Sigma(row_num, col_num);
+       h_Sigma.clear();
+ 
+       for (vcl_size_t i = 0; i < to; i++)
+         h_Sigma(i, i) = dh[i];
++>>>>>>> upstream/1.5.1
  
        copy(h_Sigma, A);
      }
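
As orientation for the driver above, a usage sketch; the names are
illustrative, and the reconstruction convention (original A is recovered from
QL, the singular values, and QR) is inferred from the code rather than a
documented guarantee:

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/svd.hpp"

    // Sketch: full SVD of a dense row-major matrix.
    void svd_sketch(viennacl::matrix<float, viennacl::row_major> & A)  // m x n, overwritten
    {
      viennacl::matrix<float, viennacl::row_major> QL(A.size1(), A.size1());
      viennacl::matrix<float, viennacl::row_major> QR(A.size2(), A.size2());

      viennacl::linalg::svd(A, QL, QR);
      // On return, A holds the diagonal matrix of singular values;
      // QL and QR hold the left and right orthogonal factors.
    }
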
diff --cc viennacl/linalg/vector_operations.hpp
index dc214c4,9b6eb51..30832ff
--- a/viennacl/linalg/vector_operations.hpp
+++ b/viennacl/linalg/vector_operations.hpp
@@@ -34,864 -31,632 +31,1192 @@@
  #include "viennacl/traits/start.hpp"
  #include "viennacl/traits/handle.hpp"
  #include "viennacl/traits/stride.hpp"
++<<<<<<< HEAD
++=======
+ #include "viennacl/linalg/host_based/vector_operations.hpp"
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+   #include "viennacl/linalg/opencl/vector_operations.hpp"
+ #endif
+ 
+ #ifdef VIENNACL_WITH_CUDA
+   #include "viennacl/linalg/cuda/vector_operations.hpp"
+ #endif
++>>>>>>> upstream/1.5.1
  
  namespace viennacl
  {
    namespace linalg
    {
++<<<<<<< HEAD
 +    /** @brief Assign a vector (-range/-slice) to another vector (-range/-slice).
 +    *
 +    * Computes vec1 = vec2.
 +    *
 +    * @param vec1  The destination vector.
 +    * @param vec2  The vector to be assigned
 +    */
 +    template <typename V1, typename V2>
 +    typename viennacl::enable_if< viennacl::is_vector<V1>::value
 +                                  && viennacl::is_vector<V2>::value
 +                                >::type
 +    assign(V1 & vec1,
 +           const V2 & vec2)
 +    {
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && "Incompatible vector sizes in inplace_add()!");
 +      
 +      
 +      //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "assign");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)),
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)))
 +                            );
 +    }
 +    
 +    /** @brief Addition of two vectors.
 +    *
 +    * @param vec1  The first addend. 
 +    * @param vec2  The second addend.
 +    * @param result The result vector.
 +    */
 +    template <typename V1, typename V2, typename V3>
 +    typename viennacl::enable_if< viennacl::is_vector<V1>::value
 +                                  && viennacl::is_vector<V2>::value
 +                                  && viennacl::is_vector<V3>::value
 +                                >::type
 +    add(const V1 & vec1, 
 +        const V2 & vec2, 
 +        V3 & result)
 +    {
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
 +             && "Incompatible vector sizes in add()!");
 +
 +      //unsigned int size = std::min(viennacl::traits::internal_size(vec1),
 +      //                             viennacl::traits::internal_size(vec2));
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "add");
 +      
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)),
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)),
 +                               viennacl::traits::handle(result),
 +                                cl_uint(viennacl::traits::start(result)),
 +                                cl_uint(viennacl::traits::stride(result)),
 +                                cl_uint(viennacl::traits::size(result)) )
 +                            );
++=======
+     template <typename T, typename ScalarType1>
+     void av(vector_base<T> & vec1,
+             vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+     {
+       assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in v1 = v2 @ alpha: size(v1) != size(v2)"));
+ 
+       switch (viennacl::traits::handle(vec1).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::av(vec1, vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::av(vec1, vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::av(vec1, vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
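
For readability of the dispatch functions in this file, a host-side reference
reading of what av() computes; this interpretation follows the assert string
above and is an assumption rather than library documentation, and the helper
name is illustrative:

    #include <cstddef>
    #include <vector>

    // Sketch of the av() semantics: v1 = v2 @ alpha, where '@' is '*', or '/'
    // when reciprocal_alpha is set, and alpha is negated first when
    // flip_sign_alpha is set.
    template <typename T>
    void av_reference(std::vector<T> & v1, std::vector<T> const & v2,
                      T alpha, bool reciprocal_alpha, bool flip_sign_alpha)
    {
      if (flip_sign_alpha)
        alpha = -alpha;
      for (std::size_t i = 0; i < v1.size(); ++i)
        v1[i] = reciprocal_alpha ? v2[i] / alpha : v2[i] * alpha;
    }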
  
-     /** @brief Inplace addition of two vectors. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes vec1 += vec2.
-     * 
-     * @param vec1  The result. 
-     * @param vec2  The addend
-     */
-     template <typename V1, typename V2>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_vector<V2>::value
-                                 >::type
-     inplace_add(V1 & vec1,
-                 const V2 & vec2)
+ 
+     template <typename T, typename ScalarType1, typename ScalarType2>
+     void avbv(vector_base<T> & vec1,
+               vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+               vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && "Incompatible vector sizes in inplace_add()!");
 +      
 +      
 +      //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_add");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)),
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)))
 +                            );
++=======
+       assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in v1 = v2 @ alpha + v3 @ beta: size(v1) != size(v2)"));
+       assert(viennacl::traits::size(vec2) == viennacl::traits::size(vec3) && bool("Incompatible vector sizes in v1 = v2 @ alpha + v3 @ beta: size(v2) != size(v3)"));
+ 
+       switch (viennacl::traits::handle(vec1).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::avbv(vec1,
+                                                   vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                                   vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::avbv(vec1,
+                                          vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                          vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::avbv(vec1,
+                                        vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                        vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
  
  
+     template <typename T, typename ScalarType1, typename ScalarType2>
+     void avbv_v(vector_base<T> & vec1,
+                 vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                 vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+     {
+       assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in v1 += v2 @ alpha + v3 @ beta: size(v1) != size(v2)"));
+       assert(viennacl::traits::size(vec2) == viennacl::traits::size(vec3) && bool("Incompatible vector sizes in v1 += v2 @ alpha + v3 @ beta: size(v2) != size(v3)"));
  
-     /** @brief Subtraction of two vectors. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * result = vec1 - vec2
+       switch (viennacl::traits::handle(vec1).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::avbv_v(vec1,
+                                                     vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                                     vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::avbv_v(vec1,
+                                            vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                            vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::avbv_v(vec1,
+                                          vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                          vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
+     }
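+ 
A compact reference sketch for the two fused variants just defined; the flag
handling (reciprocal/flip_sign, omitted here) acts on alpha and beta exactly
as in av(), and this reading is again an assumption derived from the assert
strings:

    #include <cstddef>
    #include <vector>

    // avbv:   v1  = v2 @ alpha + v3 @ beta
    // avbv_v: v1 += v2 @ alpha + v3 @ beta
    template <typename T>
    void avbv_reference(std::vector<T> & v1,
                        std::vector<T> const & v2, T alpha,
                        std::vector<T> const & v3, T beta)
    {
      for (std::size_t i = 0; i < v1.size(); ++i)
        v1[i] = v2[i] * alpha + v3[i] * beta;
    }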
+ 
+ 
+     /** @brief Assign a constant value to a vector (-range/-slice)
      *
-     * @param vec1  The first operand. 
-     * @param vec2  The second operand.
-     * @param result The result vector.
+     * @param vec1   The vector to which the value should be assigned
+     * @param alpha  The value to be assigned
+     * @param up_to_internal_size    Whether 'alpha' should be written to padded memory as well. This is used for setting all entries to zero, including padded memory.
      */
-     template <typename V1, typename V2, typename V3>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_vector<V2>::value
-                                   && viennacl::is_vector<V3>::value
-                                 >::type
-     sub(const V1 & vec1,
-         const V2 & vec2,
-         V3 & result)
+     template <typename T>
+     void vector_assign(vector_base<T> & vec1, const T & alpha, bool up_to_internal_size = false)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
 +             && "Incompatible vector sizes in sub()!");
 +      
 +      //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sub");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)),
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)),
 +                               viennacl::traits::handle(result),
 +                                cl_uint(viennacl::traits::start(result)),
 +                                cl_uint(viennacl::traits::stride(result)),
 +                                cl_uint(viennacl::traits::size(result)) )
 +                            );        
++=======
+       switch (viennacl::traits::handle(vec1).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::vector_assign(vec1, alpha, up_to_internal_size);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::vector_assign(vec1, alpha, up_to_internal_size);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::vector_assign(vec1, alpha, up_to_internal_size);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
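
A short usage sketch for vector_assign(); the direct call shown here is
normally issued by the vector operator overloads rather than by user code:

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/vector_operations.hpp"

    void vector_assign_sketch()
    {
      viennacl::vector<float> v(100);
      viennacl::linalg::vector_assign(v, 1.0f);         // v[i] = 1 for all i
      viennacl::linalg::vector_assign(v, 0.0f, true);   // also clears padded entries
    }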
  
-     /** @brief Inplace addition of two vectors. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
+ 
+     /** @brief Swaps the contents of two vectors, data is copied
      *
-     * Computes vec1 -= vec2.
-     * 
-     * @param vec1  The result. 
-     * @param vec2  The subtracted vector
+     * @param vec1   The first vector (or -range, or -slice)
+     * @param vec2   The second vector (or -range, or -slice)
      */
-     template <typename V1, typename V2>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_vector<V2>::value
-                                 >::type
-     inplace_sub(V1 & vec1,
-                 const V2 & vec2)
+     template <typename T>
+     void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && "Incompatible vector sizes in inplace_sub()!");
 +      
 +      //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_sub");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)),
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)))
 +                            );        
++=======
+       assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in vector_swap()"));
+ 
+       switch (viennacl::traits::handle(vec1).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::vector_swap(vec1, vec2);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::vector_swap(vec1, vec2);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::vector_swap(vec1, vec2);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
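
A usage sketch for vector_swap(); both vectors must have the same size:

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/vector_operations.hpp"

    void vector_swap_sketch()
    {
      viennacl::vector<float> v1(100), v2(100);
      viennacl::linalg::vector_swap(v1, v2);   // element-wise deep swap
    }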
  
  
-     //result = vec * scalar
-     /** @brief Scales a vector. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes result = vec * alpha, where alpha is a gpu scalar
+     ///////////////////////// Elementwise operations /////////////
+ 
+ 
+ 
+     /** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3    (using MATLAB syntax)
      *
-     * @param vec    The vector to be scaled.
-     * @param alpha  The scaling factor.
-     * @param result The result vector.
+     * @param vec1   The result vector (or -range, or -slice)
+     * @param proxy  The proxy object holding v2, v3 and the operation
      */
-     template <typename V1, typename S2, typename V3>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_scalar<S2>::value
-                                   && viennacl::is_vector<V3>::value
-                                 >::type
-     mult(const V1 & vec,
-          S2 const & alpha,
-          V3 & result)
+     template <typename T, typename OP>
+     void element_op(vector_base<T> & vec1,
+                     vector_expression<const vector_base<T>, const vector_base<T>, OP> const & proxy)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
 +             && "Incompatible vector sizes in mult()!");
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mult");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
 +                                cl_uint(viennacl::traits::start(vec)),
 +                                cl_uint(viennacl::traits::stride(vec)),
 +                                cl_uint(viennacl::traits::size(vec)),
 +                               alpha,
 +                               viennacl::traits::handle(result),
 +                                cl_uint(viennacl::traits::start(result)),
 +                                cl_uint(viennacl::traits::stride(result)),
 +                                cl_uint(viennacl::traits::size(result)))
 +                            );        
++=======
+       assert(viennacl::traits::size(vec1) == viennacl::traits::size(proxy) && bool("Incompatible vector sizes in element_op()"));
+ 
+       switch (viennacl::traits::handle(vec1).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::element_op(vec1, proxy);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::element_op(vec1, proxy);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::element_op(vec1, proxy);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
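
The proxy argument above is the expression template holding v2, v3 and the operation. A minimal sketch of driving element_op() directly, spelling the proxy type out from the signature above (user code would normally go through the element_prod() wrapper generated below instead):

    #include <viennacl/vector.hpp>
    #include <viennacl/linalg/vector_operations.hpp>

    void element_op_demo()
    {
      viennacl::vector<float> v1(64), v2(64), v3(64);
      viennacl::vector_expression<const viennacl::vector_base<float>,
                                  const viennacl::vector_base<float>,
                                  viennacl::op_element_binary<viennacl::op_prod> > proxy(v2, v3);
      viennacl::linalg::element_op(v1, proxy);  // v1 = v2 .* v3, elementwise
    }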
  
-     /** @brief Scales a vector. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes result = vec * alpha, where alpha is a cpu scalar
-     *
-     * @param vec    The vector to be scaled.
-     * @param alpha  The scaling factor.
-     * @param result The result vector.
-     */
-     template <typename V1, typename SCALARTYPE, typename V3>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_cpu_scalar<SCALARTYPE>::value
-                                   && viennacl::is_vector<V3>::value
-                                 >::type
-     mult(V1 const & vec,
-          SCALARTYPE alpha,
-          V3 & result)
+     /** \cond */
+ 
+ // Helper macro for generating binary element-wise operations such as element_prod(), element_div(), element_pow() without unnecessary code duplication
+ #define VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS(OPNAME) \
+     template <typename T> \
+     viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_##OPNAME> > \
+     element_##OPNAME(vector_base<T> const & v1, vector_base<T> const & v2) \
+     { \
+       return viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_##OPNAME> >(v1, v2); \
+     } \
+ \
+     template <typename V1, typename V2, typename OP, typename T> \
+     viennacl::vector_expression<const vector_expression<const V1, const V2, OP>, const vector_base<T>, op_element_binary<op_##OPNAME> > \
+     element_##OPNAME(vector_expression<const V1, const V2, OP> const & proxy, vector_base<T> const & v2) \
+     { \
+       return viennacl::vector_expression<const vector_expression<const V1, const V2, OP>, const vector_base<T>, op_element_binary<op_##OPNAME> >(proxy, v2); \
+     } \
+ \
+     template <typename T, typename V2, typename V3, typename OP> \
+     viennacl::vector_expression<const vector_base<T>, const vector_expression<const V2, const V3, OP>, op_element_binary<op_##OPNAME> > \
+     element_##OPNAME(vector_base<T> const & v1, vector_expression<const V2, const V3, OP> const & proxy) \
+     { \
+       return viennacl::vector_expression<const vector_base<T>, const vector_expression<const V2, const V3, OP>, op_element_binary<op_##OPNAME> >(v1, proxy); \
+     } \
+ \
+     template <typename V1, typename V2, typename OP1, \
+               typename V3, typename V4, typename OP2> \
+     viennacl::vector_expression<const vector_expression<const V1, const V2, OP1>, \
+                                 const vector_expression<const V3, const V4, OP2>, \
+                                 op_element_binary<op_##OPNAME> > \
+     element_##OPNAME(vector_expression<const V1, const V2, OP1> const & proxy1, \
+                      vector_expression<const V3, const V4, OP2> const & proxy2) \
+     {\
+       return viennacl::vector_expression<const vector_expression<const V1, const V2, OP1>, \
+                                          const vector_expression<const V3, const V4, OP2>, \
+                                          op_element_binary<op_##OPNAME> >(proxy1, proxy2); \
+     }
+ 
+     VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS(prod)  //for element_prod()
+     VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS(div)   //for element_div()
+     VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS(pow)   //for element_pow()
+ 
+ #undef VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS
+ 
+ // Helper macro for generating unary element-wise operations such as element_exp(), element_sin(), etc. without unnecessary code duplication
+ #define VIENNACL_MAKE_UNARY_ELEMENT_OP(funcname) \
+     template <typename T> \
+     viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_##funcname> > \
+     element_##funcname(vector_base<T> const & v) \
+     { \
+       return viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_##funcname> >(v, v); \
+     } \
+     template <typename LHS, typename RHS, typename OP> \
+     viennacl::vector_expression<const vector_expression<const LHS, const RHS, OP>, \
+                                 const vector_expression<const LHS, const RHS, OP>, \
+                                 op_element_unary<op_##funcname> > \
+     element_##funcname(vector_expression<const LHS, const RHS, OP> const & proxy) \
+     { \
+       return viennacl::vector_expression<const vector_expression<const LHS, const RHS, OP>, \
+                                          const vector_expression<const LHS, const RHS, OP>, \
+                                          op_element_unary<op_##funcname> >(proxy, proxy); \
+     } \
+ 
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(abs)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(acos)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(asin)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(atan)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(ceil)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(cos)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(cosh)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(exp)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(fabs)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(floor)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(log)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(log10)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(sin)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(sinh)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(sqrt)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(tan)
+     VIENNACL_MAKE_UNARY_ELEMENT_OP(tanh)
+ 
+ #undef VIENNACL_MAKE_UNARY_ELEMENT_OP
+ 
+     /** \endcond */
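
A sketch of the user-facing wrappers the two macros above generate (MATLAB-style elementwise semantics; assigning the returned expression to a viennacl::vector triggers evaluation):

    #include <viennacl/vector.hpp>

    void elementwise_wrappers_demo()
    {
      viennacl::vector<double> a(32), b(32), c(32);
      c = viennacl::linalg::element_prod(a, b);  // c_i = a_i * b_i
      c = viennacl::linalg::element_div(a, b);   // c_i = a_i / b_i
      c = viennacl::linalg::element_pow(a, b);   // c_i = pow(a_i, b_i)
      c = viennacl::linalg::element_sqrt(a);     // c_i = sqrt(a_i)
    }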
+ 
+     ///////////////////////// Norms and inner product ///////////////////
+ 
+ 
+     //implementation of inner product:
+     //namespace {
+ 
+     /** @brief Computes the inner product of two vectors - dispatcher interface
+      *
+      * @param vec1 The first vector
+      * @param vec2 The second vector
+      * @param result The result scalar (on the gpu)
+      */
+     template <typename T>
+     void inner_prod_impl(vector_base<T> const & vec1,
+                          vector_base<T> const & vec2,
+                          scalar<T> & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
 +             && "Incompatible vector sizes in mult()!");
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_mult");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
 +                                cl_uint(viennacl::traits::start(vec)),
 +                                cl_uint(viennacl::traits::stride(vec)),
 +                                cl_uint(viennacl::traits::size(vec)),
 +                               static_cast<value_type>(alpha),
 +                               viennacl::traits::handle(result),
 +                                cl_uint(viennacl::traits::start(result)),
 +                                cl_uint(viennacl::traits::stride(result)),
 +                                cl_uint(viennacl::traits::size(result)))
 +                            );        
++=======
+       assert( vec1.size() == vec2.size() && bool("Size mismatch") );
+ 
+       switch (viennacl::traits::handle(vec1).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::inner_prod_impl(vec1, vec2, result);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::inner_prod_impl(vec1, vec2, result);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::inner_prod_impl(vec1, vec2, result);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
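
A minimal sketch of the dispatcher above; library users would normally call viennacl::linalg::inner_prod(x, y) rather than the _impl entry point:

    #include <viennacl/vector.hpp>
    #include <viennacl/scalar.hpp>

    void inner_prod_demo()
    {
      viennacl::vector<float> x(1000), y(1000);
      viennacl::scalar<float> s = 0.0f;            // result stays on the device
      viennacl::linalg::inner_prod_impl(x, y, s);  // s <- <x, y>
    }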
  
-     /** @brief Scales a vector inplace. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes result *= alpha, where alpha is a gpu scalar
-     *
-     * @param vec    The vector to be scaled.
-     * @param alpha  The scaling factor.
-     */
-     template <typename V1, typename S2>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_scalar<S2>::value
-                                 >::type
-     inplace_mult(V1 & vec,
-                  S2 const & alpha)
+     // vector expression on lhs
+     template <typename LHS, typename RHS, typename OP, typename T>
+     void inner_prod_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+                          vector_base<T> const & vec2,
+                          scalar<T> & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mult");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
 +                                cl_uint(viennacl::traits::start(vec)),
 +                                cl_uint(viennacl::traits::stride(vec)),
 +                                cl_uint(viennacl::traits::size(vec)),
 +                               alpha)
 +                            );
++=======
+       viennacl::vector<T> temp = vec1;
+       inner_prod_impl(temp, vec2, result);
++>>>>>>> upstream/1.5.1
      }
  
-     /** @brief Scales a vector inplace. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes result *= alpha, where alpha is a cpu scalar
-     *
-     * @param vec    The vector to be scaled.
-     * @param alpha  The scaling factor.
-     */
-     template <typename V1, typename S2>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_cpu_scalar<S2>::value
-                                 >::type
-     inplace_mult(V1 & vec,
-                  S2 alpha)
+ 
+     // vector expression on rhs
+     template <typename T, typename LHS, typename RHS, typename OP>
+     void inner_prod_impl(vector_base<T> const & vec1,
+                          viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+                          scalar<T> & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_inplace_mult");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), 
 +                                cl_uint(viennacl::traits::start(vec)), 
 +                                cl_uint(viennacl::traits::stride(vec)), 
 +                                cl_uint(viennacl::traits::size(vec)), 
 +                               static_cast<value_type>(alpha))
 +                            );        
++=======
+       viennacl::vector<T> temp = vec2;
+       inner_prod_impl(vec1, temp, result);
++>>>>>>> upstream/1.5.1
      }
  
-     //result = vec / scalar
-     /** @brief Scales a vector. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes result = vec / alpha, where alpha is a gpu scalar
-     *
-     * @param vec    The vector to be scaled.
-     * @param alpha  The (inverse) scaling factor.
-     * @param result The result vector.
-     */
-     template <typename V1, typename S2, typename V3>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_scalar<S2>::value
-                                   && viennacl::is_vector<V3>::value
-                                 >::type
-     divide(V1 const & vec,
-            S2 const & alpha,
-            V3 & result)
+ 
+     // vector expression on lhs and rhs
+     template <typename LHS1, typename RHS1, typename OP1,
+               typename LHS2, typename RHS2, typename OP2, typename T>
+     void inner_prod_impl(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+                          viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+                          scalar<T> & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
 +             && "Incompatible vector sizes in divide()!");
 +
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "divide");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), 
 +                                cl_uint(viennacl::traits::start(vec)),
 +                                cl_uint(viennacl::traits::stride(vec)),
 +                                cl_uint(viennacl::traits::size(vec)),
 +                               alpha,
 +                               viennacl::traits::handle(result),
 +                                cl_uint(viennacl::traits::start(result)),
 +                                cl_uint(viennacl::traits::stride(result)),
 +                                cl_uint(viennacl::traits::size(result)))
 +                            );
++=======
+       viennacl::vector<T> temp1 = vec1;
+       viennacl::vector<T> temp2 = vec2;
+       inner_prod_impl(temp1, temp2, result);
++>>>>>>> upstream/1.5.1
      }
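
The three overloads above exist so that expression operands are first materialized into a temporary viennacl::vector, after which the plain vector_base overload is dispatched. A sketch, assuming the operator+ expression templates from viennacl/vector.hpp and the includes from the sketch above:

    viennacl::vector<float> u(500), v(500), w(500);
    viennacl::scalar<float> s = 0.0f;
    viennacl::linalg::inner_prod_impl(u + v, w, s);  // internally: temp = u + v, then <temp, w>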
  
-     /** @brief Scales a vector inplace. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes result *= alpha, where alpha is a gpu scalar
-     *
-     * @param vec    The vector to be scaled.
-     * @param alpha  The (inverse) scaling factor.
-     */
-     template <typename V1, typename S2>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_scalar<S2>::value
-                                 >::type
-     inplace_divide(V1 & vec,
-                    S2 const & alpha)
+ 
+ 
+ 
+     /** @brief Computes the inner product of two vectors with the final reduction step on the CPU - dispatcher interface
+      *
+      * @param vec1 The first vector
+      * @param vec2 The second vector
+      * @param result The result scalar (on the host)
+      */
+     template <typename T>
+     void inner_prod_cpu(vector_base<T> const & vec1,
+                         vector_base<T> const & vec2,
+                         T & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_divide");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
 +                                cl_uint(viennacl::traits::start(vec)),
 +                                cl_uint(viennacl::traits::stride(vec)),
 +                                cl_uint(viennacl::traits::size(vec)), 
 +                               alpha) 
 +                            );
++=======
+       assert( vec1.size() == vec2.size() && bool("Size mismatch") );
+ 
+       switch (viennacl::traits::handle(vec1).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::inner_prod_impl(vec1, vec2, result);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::inner_prod_cpu(vec1, vec2, result);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::inner_prod_cpu(vec1, vec2, result);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
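
Sketch of the CPU-reduction variant (includes as in the earlier sketches): the final value lands in a plain host scalar, so it is available as soon as the call returns:

    viennacl::vector<double> x(1000), y(1000);
    double s = 0.0;
    viennacl::linalg::inner_prod_cpu(x, y, s);  // s holds <x, y> on the host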
  
-     //result = factor * vec1 + vec2
-     /** @brief Multiply-add operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes result = alpha * vec1 + vec2, where alpha is a gpu scalar
-     *
-     * @param vec1    The first addend
-     * @param alpha  The scaling factor for the first addend.
-     * @param vec2    The second addend.
-     * @param result The result vector.
-     */
-     template <typename V1, typename S2, typename V3, typename V4>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_scalar<S2>::value
-                                   && viennacl::is_vector<V3>::value
-                                   && viennacl::is_vector<V4>::value
-                                 >::type
-     mul_add(V1 const & vec1,
-             S2 const & alpha,
-             V3 const & vec2,
-             V4 & result)
+     // vector expression on lhs
+     template <typename LHS, typename RHS, typename OP, typename T>
+     void inner_prod_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+                         vector_base<T> const & vec2,
+                         T & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
 +             && "Incompatible vector sizes in mul_add()!");
 +      
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mul_add");
 +      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)),
 +                               alpha,
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)),
 +                               viennacl::traits::handle(result),
 +                                cl_uint(viennacl::traits::start(result)),
 +                                cl_uint(viennacl::traits::stride(result)),
 +                                cl_uint(viennacl::traits::size(result)))
 +                            );        
++=======
+       viennacl::vector<T> temp = vec1;
+       inner_prod_cpu(temp, vec2, result);
++>>>>>>> upstream/1.5.1
      }
  
-     /** @brief Multiply-add operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes result = alpha * vec1 + vec2, where alpha is a cpu scalar
-     *
-     * @param vec1    The first addend
-     * @param alpha   The scaling factor for the first addend.
-     * @param vec2    The second addend.
-     * @param result  The result vector.
-     */
-     template <typename V1, typename SCALARTYPE, typename V3, typename V4>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_cpu_scalar<SCALARTYPE>::value
-                                   && viennacl::is_vector<V3>::value
-                                   && viennacl::is_vector<V4>::value
-                                 >::type
-     mul_add(V1 const & vec1,
-             SCALARTYPE alpha,
-             V3 const & vec2,
-             V4 & result)
+ 
+     // vector expression on rhs
+     template <typename T, typename LHS, typename RHS, typename OP>
+     void inner_prod_cpu(vector_base<T> const & vec1,
+                         viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+                         T & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
 +             && "Incompatible vector sizes in mul_add()!");
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_mul_add");
 +      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)), 
 +                               static_cast<value_type>(alpha),
 +                               viennacl::traits::handle(vec2), 
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)), 
 +                               viennacl::traits::handle(result),
 +                                cl_uint(viennacl::traits::start(result)),
 +                                cl_uint(viennacl::traits::stride(result)),
 +                                cl_uint(viennacl::traits::size(result)))
 +                            );
++=======
+       viennacl::vector<T> temp = vec2;
+       inner_prod_cpu(vec1, temp, result);
++>>>>>>> upstream/1.5.1
      }
  
-     //vec1 += factor * vec2
-     /** @brief Inplace Multiply-add operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes vec1 += alpha * vec2, where alpha is a gpu scalar
-     *
-     * @param vec1    The first addend
-     * @param alpha   The scaling factor for the second addend.
-     * @param vec2    The second addend.
-     */
-     template <typename V1, typename V2, typename S3>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_vector<V2>::value
-                                   && viennacl::is_scalar<S3>::value
-                                 >::type
-     inplace_mul_add(V1 & vec1,
-                     V2 const & vec2,
-                     S3 const & alpha)
+ 
+     // vector expression on lhs and rhs
+     template <typename LHS1, typename RHS1, typename OP1,
+               typename LHS2, typename RHS2, typename OP2, typename S3>
+     void inner_prod_cpu(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+                         viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+                         S3 & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && "Incompatible vector sizes in inplace_mul_add()!");
 +      
 +      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mul_add");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)), 
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)), 
 +                               alpha));
++=======
+       viennacl::vector<S3> temp1 = vec1;
+       viennacl::vector<S3> temp2 = vec2;
+       inner_prod_cpu(temp1, temp2, result);
++>>>>>>> upstream/1.5.1
      }
  
-     /** @brief Inplace Multiply-add operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes vec1 += alpha * vec2, where alpha is a cpu scalar
-     *
-     * @param vec1    The first addend
-     * @param vec2    The second addend.
-     * @param alpha   The scaling factor for the second addend.
-     */
-     template <typename V1, typename V2, typename SCALARTYPE>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_vector<V2>::value
-                                   && viennacl::is_cpu_scalar<SCALARTYPE>::value
-                                 >::type
-     inplace_mul_add(V1 & vec1,
-                     V2 const & vec2,
-                     SCALARTYPE alpha)
+ 
+ 
+     /** @brief Computes the inner products <x, y1>, <x, y2>, ..., <x, y_N> and writes the result to a (sub-)vector
+      *
+      * @param x       The common vector
+      * @param y_tuple A collection of vectors, all of the same size.
+      * @param result  The result vector (on the gpu). Its size needs to match the number of elements in y_tuple
+      */
+     template <typename T>
+     void inner_prod_impl(vector_base<T> const & x,
+                          vector_tuple<T> const & y_tuple,
+                          vector_base<T> & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && "Incompatible vector sizes in inplace_mul_add()!");
 +
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_inplace_mul_add");
 +      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)), 
 +                                cl_uint(viennacl::traits::size(vec1)), 
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)), 
 +                                cl_uint(viennacl::traits::size(vec2)), 
 +                               value_type(alpha)));
++=======
+       assert( x.size() == y_tuple.const_at(0).size() && bool("Size mismatch") );
+       assert( result.size() == y_tuple.const_size() && bool("Number of elements does not match result size") );
+ 
+       switch (viennacl::traits::handle(x).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::inner_prod_impl(x, y_tuple, result);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::inner_prod_impl(x, y_tuple, result);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::inner_prod_impl(x, y_tuple, result);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
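
A sketch of the tuple variant, assuming viennacl::tie() from viennacl/vector.hpp as the vector_tuple constructor (per the 1.5 multiple-inner-products feature); all inner products share one sweep over x:

    viennacl::vector<float> x(256), y1(256), y2(256), y3(256);
    viennacl::vector<float> results(3);  // one entry per y-vector
    viennacl::linalg::inner_prod_impl(x, viennacl::tie(y1, y2, y3), results);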
  
-     /** @brief Multiply-subtract operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes result = alpha * vec1 - vec2, where alpha is a gpu scalar
+ 
+     /** @brief Computes the l^1-norm of a vector - dispatcher interface
      *
-     * @param vec1    The first vector operand
-     * @param alpha   The scaling factor for the first vector.
-     * @param vec2    The second operand.
-     * @param result  The result vector.
+     * @param vec The vector
+     * @param result The result scalar
      */
-     template <typename V1, typename S2, typename V3, typename V4>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_scalar<S2>::value
-                                   && viennacl::is_vector<V3>::value
-                                   && viennacl::is_vector<V4>::value
-                                 >::type
-     mul_sub(V1 const & vec1,
-             S2 const & alpha,
-             V3 const & vec2,
-             V4 & result)
+     template <typename T>
+     void norm_1_impl(vector_base<T> const & vec,
+                      scalar<T> & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
 +             && "Incompatible vector sizes in mul_sub()!");
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mul_sub");
 +      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)), 
 +                               alpha,
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)), 
 +                               viennacl::traits::handle(result),
 +                                cl_uint(viennacl::traits::start(result)),
 +                                cl_uint(viennacl::traits::stride(result)),
 +                                cl_uint(viennacl::traits::size(result)))
 +                            );
++=======
+       switch (viennacl::traits::handle(vec).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::norm_1_impl(vec, result);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::norm_1_impl(vec, result);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::norm_1_impl(vec, result);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
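
Usage sketch for the l^1-norm dispatcher above (user code would normally call viennacl::linalg::norm_1(v); includes as in the earlier sketches):

    viennacl::vector<float> v(128);
    viennacl::scalar<float> r = 0.0f;
    viennacl::linalg::norm_1_impl(v, r);  // r = sum_i |v_i|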
  
  
-     /** @brief Inplace Multiply-subtract operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes vec1 -= alpha * vec2, where alpha is a gpu scalar
+     /** @brief Computes the l^1-norm of a vector - interface for a vector expression. Creates a temporary.
      *
-     * @param vec1    The result vector which is updated
-     * @param vec2    The second operand.
-     * @param alpha   The scaling factor for the vector update.
+     * @param vec    The vector expression
+     * @param result The result scalar
      */
-     template <typename V1, typename V2, typename S3>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_vector<V2>::value
-                                   && viennacl::is_scalar<S3>::value
-                                 >::type
-     inplace_mul_sub(V1 & vec1,
-                     V2 const & vec2,
-                     S3 const & alpha)
+     template <typename LHS, typename RHS, typename OP, typename S2>
+     void norm_1_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                      S2 & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && "Incompatible vector sizes in inplace_mul_sub()!");
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mul_sub");
 +      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)), 
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)), 
 +                               alpha)
 +                            );        
++=======
+       viennacl::vector<typename viennacl::result_of::cpu_value_type<S2>::type> temp = vec;
+       norm_1_impl(temp, result);
++>>>>>>> upstream/1.5.1
      }
  
-     /** @brief Inplace divide-add operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes vec1 += vec2 / alpha, where alpha is a gpu scalar
+ 
+ 
+     /** @brief Computes the l^1-norm of a vector with final reduction on the CPU
      *
-     * @param vec1    The first vector
-     * @param vec2    The vector update
-     * @param alpha   The scaling factor for the second vector.
+     * @param vec The vector
+     * @param result The result scalar
      */
-     template <typename V1, typename V2, typename S3>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_vector<V2>::value
-                                   && viennacl::is_scalar<S3>::value
-                                 >::type
-     inplace_div_add(V1 & vec1,
-                     V2 const & vec2,
-                     S3 const & alpha)
+     template <typename T>
+     void norm_1_cpu(vector_base<T> const & vec,
+                     T & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && "Incompatible vector sizes in inplace_div_add()!");
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_div_add");
 +      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)), 
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)), 
 +                               alpha)
 +                            );
++=======
+       switch (viennacl::traits::handle(vec).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::norm_1_impl(vec, result);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::norm_1_cpu(vec, result);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::norm_1_cpu(vec, result);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
  
-     /** @brief Inplace divide-subtract operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-     *
-     * Computes vec1 -= vec2 / alpha, where alpha is a gpu scalar
+     /** @brief Computes the l^1-norm of a vector with final reduction on the CPU - interface for a vector expression. Creates a temporary.
      *
-     * @param vec1    The first vector
-     * @param vec2    The vector update
-     * @param alpha   The scaling factor for the second vector.
+     * @param vec    The vector expression
+     * @param result The result scalar
      */
-     template <typename V1, typename V2, typename S3>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_vector<V2>::value
-                                   && viennacl::is_scalar<S3>::value
-                                 >::type
-     inplace_div_sub(V1 & vec1,
-                     V2 const & vec2,
-                     S3 const & alpha)
+     template <typename LHS, typename RHS, typename OP, typename S2>
+     void norm_1_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                     S2 & result)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && "Incompatible vector sizes in inplace_div_sub()!");
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_div_sub");
 +      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)),
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)),
 +                               alpha)
 +                            );
++=======
+       viennacl::vector<typename viennacl::result_of::cpu_value_type<LHS>::type> temp = vec;
+       norm_1_cpu(temp, result);
++>>>>>>> upstream/1.5.1
      }
  
  
-     ///////////////////////// Norms and inner product ///////////////////
  
  
++<<<<<<< HEAD
 +    //implementation of inner product:
 +    //namespace {
 +    /** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
 +     *
 +     * @param vec1 The first vector
 +     * @param vec2 The second vector
 +     * @param result The result scalar (on the gpu)
 +     * @param dummy  Dummy parameter used for SFINAE
 +     */
 +    template <typename V1, typename V2, typename S3>
 +    void inner_prod_impl(V1 const & vec1,
 +                         V2 const & vec2,
 +                         S3 & result,
 +                         typename viennacl::enable_if< viennacl::is_vector<V1>::value
 +                                                       && viennacl::is_vector<V2>::value
 +                                                       && viennacl::is_scalar<S3>::value
 +#ifdef _MSC_VER
 +                                                     >::type * dummy = 0)
 +#else
 +                                                     >::type * dummy)
 +#endif                                                   
 +    {
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +    
 +      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
 +             && "Incompatible vector sizes in inner_prod_impl()!");
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inner_prod");
 +      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
 +      unsigned int work_groups = k.global_work_size() / k.local_work_size();
 +      
 +      static viennacl::vector<value_type> temp(work_groups);
 +      
 +      //Note: Number of work groups MUST be a power of two!
 +      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
 +      assert( work_groups * k.local_work_size() == k.global_work_size() );
 +      assert( (k.global_work_size() / k.local_work_size()) == 1 
 +              || (k.global_work_size() / k.local_work_size()) == 2 
 +              || (k.global_work_size() / k.local_work_size()) == 4
 +              || (k.global_work_size() / k.local_work_size()) == 8
 +              || (k.global_work_size() / k.local_work_size()) == 16
 +              || (k.global_work_size() / k.local_work_size()) == 32
 +              || (k.global_work_size() / k.local_work_size()) == 64
 +              || (k.global_work_size() / k.local_work_size()) == 128
 +              || (k.global_work_size() / k.local_work_size()) == 256
 +              || (k.global_work_size() / k.local_work_size()) == 512 );
 +              
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),
 +                                cl_uint(viennacl::traits::size(vec1)),
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),
 +                                cl_uint(viennacl::traits::size(vec2)),
 +                               viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
 +                               temp));        
 +
 +      viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sum");
 +      
 +      ksum.local_work_size(0, work_groups);
 +      ksum.global_work_size(0, work_groups);
 +      viennacl::ocl::enqueue(ksum(viennacl::traits::handle(temp),
 +                                  cl_uint(viennacl::traits::start(temp)),
 +                                  cl_uint(viennacl::traits::stride(temp)),
 +                                  cl_uint(viennacl::traits::size(temp)),
 +                                  result)
 +                            );
++=======
+     /** @brief Computes the l^2-norm of a vector - dispatcher interface
+     *
+     * @param vec The vector
+     * @param result The result scalar
+     */
+     template <typename T>
+     void norm_2_impl(vector_base<T> const & vec,
+                      scalar<T> & result)
+     {
+       switch (viennacl::traits::handle(vec).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::norm_2_impl(vec, result);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::norm_2_impl(vec, result);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::norm_2_impl(vec, result);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
  
-     //public interface of inner product
-     /** @brief Computes the inner product of two vectors.
+     /** @brief Computes the l^2-norm of a vector - interface for a vector expression. Creates a temporary.
      *
-     * @param vec1 The first vector
-     * @param vec2 The second vector
-     * @return The result
+     * @param vec    The vector expression
+     * @param result The result scalar
      */
-     template <typename V1, typename V2>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_vector<V2>::value,
-                                   viennacl::scalar_expression< const V1, 
-                                                                const V2,
-                                                                viennacl::op_inner_prod >
-                                 >::type
-     inner_prod_impl(V1 const & vec1,
-                     V2 const & vec2)
+     template <typename LHS, typename RHS, typename OP, typename T>
+     void norm_2_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                      scalar<T> & result)
      {
-       return viennacl::scalar_expression< const V1, 
-                                           const V2,
-                                           viennacl::op_inner_prod >(vec1, vec2);
+       viennacl::vector<T> temp = vec;
+       norm_2_impl(temp, result);
      }
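
Usage sketch, analogous to the norm_1 example above:

    viennacl::vector<float> v(128);
    viennacl::scalar<float> r = 0.0f;
    viennacl::linalg::norm_2_impl(v, r);  // r = sqrt(sum_i v_i * v_i)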
  
  
@@@ -900,292 -664,217 +1224,387 @@@
      *
      * @param vec The vector
      * @param result The result scalar
 +    * @param dummy  Dummy parameter used for SFINAE
      */
-     template <typename V1, typename S2>
-     void norm_1_impl(V1 const & vec,
-                      S2 & result,
-                      typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                    && viennacl::is_scalar<S2>::value
- #ifdef _MSC_VER
-                                                  >::type * dummy = 0)
- #else
-                                                  >::type * dummy)
- #endif                                                   
+     template <typename T>
+     void norm_2_cpu(vector_base<T> const & vec,
+                     T & result)
      {
-       typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-       
-       //TODO: Ensure that correct alignment is chosen for the kernels.
-       const unsigned int ALIGNMENT = V1::alignment;
-       
-       viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "norm_1");
-       //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
-       
-       if (k.local_work_size() != k.global_work_size())
+       switch (viennacl::traits::handle(vec).get_active_handle_id())
        {
-         //NOTE: For some reason, the kernel could not be started with several work groups on NVIDIA hardware. This forces us to use as many parallel threads within a single work group as possible.
-         k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
-         k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::norm_2_impl(vec, result);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::norm_2_cpu(vec, result);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::norm_2_cpu(vec, result);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
        }
++<<<<<<< HEAD
 +      
 +      unsigned int work_groups = k.global_work_size() / k.local_work_size();
 +      viennacl::vector<value_type> temp(work_groups);
 +
 +      //Note: Number of work groups MUST be a power of two!
 +      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
 +      assert( work_groups * k.local_work_size() == k.global_work_size() );
 +      assert( (k.global_work_size() / k.local_work_size()) == 1 
 +             || (k.global_work_size() / k.local_work_size()) == 2 
 +             || (k.global_work_size() / k.local_work_size()) == 4
 +             || (k.global_work_size() / k.local_work_size()) == 8
 +             || (k.global_work_size() / k.local_work_size()) == 16
 +             || (k.global_work_size() / k.local_work_size()) == 32
 +             || (k.global_work_size() / k.local_work_size()) == 64
 +             || (k.global_work_size() / k.local_work_size()) == 128
 +             || (k.global_work_size() / k.local_work_size()) == 256
 +             || (k.global_work_size() / k.local_work_size()) == 512 );
 +               
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
 +                                cl_uint(viennacl::traits::start(vec)),
 +                                cl_uint(viennacl::traits::stride(vec)),
 +                                cl_uint(viennacl::traits::size(vec)),                                 
 +                                viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
 +                                temp));        
 +      
 +      viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sum");
 +      
 +      ksum.local_work_size(0, work_groups);
 +      ksum.global_work_size(0, work_groups);
 +      viennacl::ocl::enqueue(ksum(viennacl::traits::handle(temp),
 +                                  cl_uint(viennacl::traits::start(temp)),
 +                                  cl_uint(viennacl::traits::stride(temp)),
 +                                  cl_uint(viennacl::traits::size(temp)),
 +                                  result)
 +                            );
++=======
++>>>>>>> upstream/1.5.1
      }
  
-     /** @brief Computes the l^2-norm of a vector - implementation
+     /** @brief Computes the l^2-norm of a vector with final reduction on the CPU - interface for a vector expression. Creates a temporary.
+     *
+     * @param vec    The vector expression
+     * @param result The result scalar
+     */
+     template <typename LHS, typename RHS, typename OP, typename S2>
+     void norm_2_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                     S2 & result)
+     {
+       viennacl::vector<typename viennacl::result_of::cpu_value_type<LHS>::type> temp = vec;
+       norm_2_cpu(temp, result);
+     }
+ 
+ 
+ 
+ 
+     /** @brief Computes the supremum-norm of a vector
      *
      * @param vec The vector
      * @param result The result scalar
 +    * @param dummy  Dummy parameter used for SFINAE
      */
-     template <typename V1, typename S2>
-     void norm_2_impl(V1 const & vec,
-                      S2 & result,
-                      typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                   && viennacl::is_scalar<S2>::value
- #ifdef _MSC_VER
-                                                  >::type * dummy = 0)
- #else
-                                                  >::type * dummy)
- #endif                                                   
+     template <typename T>
+     void norm_inf_impl(vector_base<T> const & vec,
+                        scalar<T> & result)
      {
-       typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-       
-       //TODO: Ensure that correct alignment is chosen for the kernels.
-       const unsigned int ALIGNMENT = V1::alignment;
-       
-       viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "norm_2");
-       //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
-       
-       if (k.local_work_size() != k.global_work_size())
+       switch (viennacl::traits::handle(vec).get_active_handle_id())
        {
-         //NOTE: For some reasons the kernel could not be started with several work groups on NVIDIA hardware. This forces us to use as many parallel threads within a single work group as possible
-         k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
-         k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::norm_inf_impl(vec, result);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::norm_inf_impl(vec, result);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::norm_inf_impl(vec, result);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
        }
+     }
  
++<<<<<<< HEAD
 +      unsigned int work_groups = k.global_work_size() / k.local_work_size();
 +      viennacl::vector<value_type> temp(work_groups);
 +        
 +      //Note: Number of work groups MUST be a power of two!
 +      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
 +      assert( work_groups * k.local_work_size() == k.global_work_size() );
 +      assert( (k.global_work_size() / k.local_work_size()) == 1 
 +             || (k.global_work_size() / k.local_work_size()) == 2 
 +             || (k.global_work_size() / k.local_work_size()) == 4
 +             || (k.global_work_size() / k.local_work_size()) == 8
 +             || (k.global_work_size() / k.local_work_size()) == 16
 +             || (k.global_work_size() / k.local_work_size()) == 32
 +             || (k.global_work_size() / k.local_work_size()) == 64
 +             || (k.global_work_size() / k.local_work_size()) == 128
 +             || (k.global_work_size() / k.local_work_size()) == 256
 +             || (k.global_work_size() / k.local_work_size()) == 512 );
 +               
 +        viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
 +                                  cl_uint(viennacl::traits::start(vec)),
 +                                  cl_uint(viennacl::traits::stride(vec)),
 +                                  cl_uint(viennacl::traits::size(vec)),                                 
 +                                 viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
 +                                 temp)
 +                              );
 +
 +        viennacl::ocl::kernel & sqrt_sum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sqrt_sum");
 +        
 +        sqrt_sum.local_work_size(0, work_groups);
 +        sqrt_sum.global_work_size(0, work_groups);
 +        viennacl::ocl::enqueue(
 +                        sqrt_sum(viennacl::traits::handle(temp),
 +                                  cl_uint(viennacl::traits::start(temp)),
 +                                  cl_uint(viennacl::traits::stride(temp)),
 +                                  cl_uint(viennacl::traits::size(temp)),
 +                                 result)
 +                              );
++=======
+     /** @brief Computes the supremum norm of a vector - interface for a vector expression. Creates a temporary.
+     *
+     * @param vec    The vector expression
+     * @param result The result scalar
+     */
+     template <typename LHS, typename RHS, typename OP, typename T>
+     void norm_inf_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                        scalar<T> & result)
+     {
+       viennacl::vector<T> temp = vec;
+       norm_inf_impl(temp, result);
++>>>>>>> upstream/1.5.1
      }
  
-     /** @brief Computes the supremum-norm of a vector
+ 
+     /** @brief Computes the supremum-norm of a vector with final reduction on the CPU
      *
      * @param vec The vector
      * @param result The result scalar
      */
-     template <typename V1, typename S2>
-     void norm_inf_impl(V1 const & vec,
-                        S2 & result,
-                        typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                      && viennacl::is_scalar<S2>::value
- #ifdef _MSC_VER
-                                                    >::type * dummy = 0)
- #else
-                                                    >::type * dummy)
- #endif                                                   
+     template <typename T>
+     void norm_inf_cpu(vector_base<T> const & vec,
+                       T & result)
      {
-       typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-       
-       //TODO: Ensure that correct alignment is chosen for the kernels.
-       const unsigned int ALIGNMENT = V1::alignment;
-       
-       //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
-       viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "norm_inf");
- 
-       if (k.local_work_size() != k.global_work_size())
+       switch (viennacl::traits::handle(vec).get_active_handle_id())
        {
-         //NOTE: For some reasons the kernel could not be started with several work groups on NVIDIA hardware. This forces us to use as many parallel threads within a single work group as possible
-         k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
-         k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::norm_inf_impl(vec, result);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::norm_inf_cpu(vec, result);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::norm_inf_cpu(vec, result);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
        }
++<<<<<<< HEAD
 +      
 +      unsigned int work_groups = k.global_work_size() / k.local_work_size();
 +      viennacl::vector<value_type> temp(work_groups);
 +        
 +      //Note: Number of work groups MUST be a power of two!
 +      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
 +      assert( work_groups * k.local_work_size() == k.global_work_size() );
 +      assert( work_groups == 1 
 +             || work_groups == 2 
 +             || work_groups == 4
 +             || work_groups == 8
 +             || work_groups == 16
 +             || work_groups == 32
 +             || work_groups == 64
 +             || work_groups == 128
 +             || work_groups == 256
 +             || work_groups == 512 );
 +               
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
 +                                cl_uint(viennacl::traits::start(vec)),
 +                                cl_uint(viennacl::traits::stride(vec)),
 +                                cl_uint(viennacl::traits::size(vec)),                                 
 +                               viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
 +                               temp));
 +      //viennacl::ocl::get_queue().finish();
 +      
 +      //part 2: parallel reduction of reduced kernel:
 +      viennacl::ocl::kernel & max_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "vmax");
 +      max_kernel.local_work_size(0, work_groups);
 +      max_kernel.global_work_size(0, work_groups);
 +      
 +      viennacl::ocl::enqueue(
 +                       max_kernel(viennacl::traits::handle(temp),
 +                                   cl_uint(viennacl::traits::start(temp)),
 +                                   cl_uint(viennacl::traits::stride(temp)),
 +                                   cl_uint(viennacl::traits::size(temp)),
 +                                  result)
 +                            );
++=======
++>>>>>>> upstream/1.5.1
      }
  
-     //This function should return a CPU scalar, otherwise statements like 
-     // vcl_rhs[index_norm_inf(vcl_rhs)] 
+     /** @brief Computes the supremum norm of a vector with final reduction on the CPU - interface for a vector expression. Creates a temporary.
+     *
+     * @param vec    The vector expression
+     * @param result The result scalar
+     */
+     template <typename LHS, typename RHS, typename OP, typename S2>
+     void norm_inf_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                       S2 & result)
+     {
+       viennacl::vector<typename viennacl::result_of::cpu_value_type<LHS>::type> temp = vec;
+       norm_inf_cpu(temp, result);
+     }
+ 
+ 
+     //This function should return a CPU scalar, otherwise statements like
+     // vcl_rhs[index_norm_inf(vcl_rhs)]
      // are ambiguous
      /** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus.
      *
      * @param vec The vector
-     * @return The result. Note that the result must be a CPU scalar (unsigned int), since gpu scalars are floating point types.
+     * @return The result. Note that the result must be a CPU scalar
      */
-     template <typename V1>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value,
-                                   cl_uint
-                                 >::type
-     index_norm_inf(V1 const & vec)
+     template <typename T>
+     vcl_size_t index_norm_inf(vector_base<T> const & vec)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      viennacl::ocl::handle<cl_mem> h = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint));
 +      
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "index_norm_inf");
 +      //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
 +
 +      k.global_work_size(0, k.local_work_size());
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
 +                                cl_uint(viennacl::traits::start(vec)),
 +                                cl_uint(viennacl::traits::stride(vec)),
 +                                cl_uint(viennacl::traits::size(vec)),                                 
 +                               viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
 +                               viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h));
 +      
 +      //read value:
 +      cl_uint result;
 +      cl_int err;
 +      err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), h.get(), CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
 +      VIENNACL_ERR_CHECK(err);
 +      return result;
++=======
+       switch (viennacl::traits::handle(vec).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           return viennacl::linalg::host_based::index_norm_inf(vec);
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           return viennacl::linalg::opencl::index_norm_inf(vec);
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           return viennacl::linalg::cuda::index_norm_inf(vec);
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
-     
-     //TODO: Special case vec1 == vec2 allows improvement!!
+ 
+     /** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus - interface for a vector expression. Creates a temporary.
+     *
+     * @param vec    The vector expression
+     */
+     template <typename LHS, typename RHS, typename OP>
+     vcl_size_t index_norm_inf(viennacl::vector_expression<LHS, RHS, OP> const & vec)
+     {
+       viennacl::vector<typename viennacl::result_of::cpu_value_type<LHS>::type> temp = vec;
+       return index_norm_inf(temp);
+     }
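+ 
+     // Illustrative usage sketch (hypothetical values, for exposition only):
+     //   viennacl::vector<float> v(100);
+     //   vcl_size_t idx = viennacl::linalg::index_norm_inf(v);
+     //   // v[idx] then holds a (first) entry of maximum modulus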
+ 
+ 
      /** @brief Computes a plane rotation of two vectors.
      *
      * Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
      *
      * @param vec1   The first vector
      * @param vec2   The second vector
-     * @param alpha  The first transformation coefficient
-     * @param beta   The second transformation coefficient
+     * @param alpha  The first transformation coefficient (CPU scalar)
+     * @param beta   The second transformation coefficient (CPU scalar)
      */
-     template <typename V1, typename V2, typename SCALARTYPE>
-     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                   && viennacl::is_vector<V2>::value
-                                   && viennacl::is_cpu_scalar<SCALARTYPE>::value
-                                 >::type
-     plane_rotation(V1 & vec1,
-                    V2 & vec2,
-                    SCALARTYPE alpha,
-                    SCALARTYPE beta)
+     template <typename T>
+     void plane_rotation(vector_base<T> & vec1,
+                         vector_base<T> & vec2,
+                         T alpha, T beta)
      {
++<<<<<<< HEAD
 +      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
 +      
 +      //TODO: Ensure that correct alignment is chosen for the kernels.
 +      const unsigned int ALIGNMENT = V1::alignment;
 +      
 +      assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2));
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "plane_rotation");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                                cl_uint(viennacl::traits::start(vec1)),
 +                                cl_uint(viennacl::traits::stride(vec1)),                                 
 +                                cl_uint(viennacl::traits::size(vec1)),                                 
 +                               viennacl::traits::handle(vec2),
 +                                cl_uint(viennacl::traits::start(vec2)),
 +                                cl_uint(viennacl::traits::stride(vec2)),                                 
 +                                cl_uint(viennacl::traits::size(vec2)),                                 
 +                               alpha,
 +                               beta)
 +                            );
++=======
+       switch (viennacl::traits::handle(vec1).get_active_handle_id())
+       {
+         case viennacl::MAIN_MEMORY:
+           viennacl::linalg::host_based::plane_rotation(vec1, vec2, alpha, beta);
+           break;
+ #ifdef VIENNACL_WITH_OPENCL
+         case viennacl::OPENCL_MEMORY:
+           viennacl::linalg::opencl::plane_rotation(vec1, vec2, alpha, beta);
+           break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+         case viennacl::CUDA_MEMORY:
+           viennacl::linalg::cuda::plane_rotation(vec1, vec2, alpha, beta);
+           break;
+ #endif
+         case viennacl::MEMORY_NOT_INITIALIZED:
+           throw memory_exception("not initialised!");
+         default:
+           throw memory_exception("not implemented");
+       }
++>>>>>>> upstream/1.5.1
      }
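+ 
+     // Illustrative usage sketch (coefficients chosen by hand so that alpha^2 + beta^2 == 1):
+     //   viennacl::vector<float> x(100), y(100);
+     //   viennacl::linalg::plane_rotation(x, y, 0.6f, 0.8f);
+     //   // afterwards: x == 0.6*x_old + 0.8*y_old  and  y == -0.8*x_old + 0.6*y_old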
-     
+ 
    } //namespace linalg
  } //namespace viennacl
  
diff --cc viennacl/matrix.hpp
index 9f4d46b,9ac27cb..bd21d6a
--- a/viennacl/matrix.hpp
+++ b/viennacl/matrix.hpp
@@@ -1,1078 -1,3048 +1,4129 @@@
++<<<<<<< HEAD
 +#ifndef VIENNACL_MATRIX_HPP_
 +#define VIENNACL_MATRIX_HPP_
 +
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file matrix.hpp
 +    @brief Implementation of the dense matrix class
 +*/
 +
 +#include "viennacl/forwards.h"
 +#include "viennacl/ocl/backend.hpp"
 +#include "viennacl/scalar.hpp"
 +#include "viennacl/vector.hpp"
 +#include "viennacl/linalg/matrix_operations.hpp"
 +#include "viennacl/tools/tools.hpp"
 +#include "viennacl/tools/matrix_size_deducer.hpp"
 +#include "viennacl/tools/matrix_kernel_class_deducer.hpp"
 +#include "viennacl/meta/result_of.hpp"
 +#include "viennacl/meta/enable_if.hpp"
 +
 +namespace viennacl
 +{
 +    /** @brief A tag for row-major storage of a dense matrix. */
 +    struct row_major
 +    {
 +      /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
 +      *
 +      * @param i   row index
 +      * @param j   column index
 +      * @param num_rows  number of rows (including alignment padding)
 +      * @param num_cols  number of columns (including alignment padding)
 +      */
 +      static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t num_rows, vcl_size_t num_cols)
 +      {
 +        return i * num_cols + j;
 +      }
 +      
 +      static vcl_size_t internal_size1(vcl_size_t rows, vcl_size_t alignment)
 +      {
 +        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(rows, alignment);
 +      }
 +      
 +      static vcl_size_t internal_size2(vcl_size_t cols, vcl_size_t alignment)
 +      {
 +        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(cols, alignment);
 +      }
 +    };
 +
 +    /** @brief A tag for column-major storage of a dense matrix. */
 +    struct column_major
 +    {
 +      /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
 +      *
 +      * @param i   row index
 +      * @param j   column index
 +      * @param num_rows  number of rows (including alignment padding)
 +      * @param num_cols  number of columns (including alignment padding)
 +      */
 +      static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t num_rows, vcl_size_t num_cols)
 +      {
 +        return i + j * num_rows;
 +      }
 +      
 +      static vcl_size_t internal_size1(vcl_size_t rows, vcl_size_t alignment)
 +      {
 +        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(rows, alignment);
 +      }
 +      
 +      static vcl_size_t internal_size2(vcl_size_t cols, vcl_size_t alignment)
 +      {
 +        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(cols, alignment);
 +      }
 +    };
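 +
 +    // Worked example (hand-checked, unpadded 2x3 matrix): entry (0,2) maps to
 +    //   row_major::mem_index(0, 2, 2, 3)    == 0*3 + 2 == 2, whereas
 +    //   column_major::mem_index(0, 2, 2, 3) == 0 + 2*2 == 4.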
 +    
 +    template <typename LHS, typename RHS, typename OP>
 +    class matrix_expression
 +    {
 +      public:
 +        ///** @brief Extracts the vector type from the two operands.
 +        //*/
 +        //typedef typename viennacl::tools::VECTOR_EXTRACTOR<LHS, RHS>::ResultType    VectorType;
 +      
 +        matrix_expression(LHS & lhs, RHS & rhs) : _lhs(lhs), _rhs(rhs) {}
 +        
 +        /** @brief Get left hand side operand
 +        */
 +        LHS & lhs() const { return _lhs; }
 +        /** @brief Get right hand side operand
 +        */
 +        RHS & rhs() const { return _rhs; }
 +        
 +        /** @brief Returns the size of the result matrix */
 +        std::size_t size1() const { return viennacl::tools::MATRIX_SIZE_DEDUCER<LHS, RHS, OP>::size1(_lhs, _rhs); }
 +        std::size_t size2() const { return viennacl::tools::MATRIX_SIZE_DEDUCER<LHS, RHS, OP>::size2(_lhs, _rhs); }
 +        
 +      private:
 +        /** @brief The left hand side operand */
 +        typename result_of::matrix_expression_internal_storage<LHS>::type _lhs;
 +        /** @brief The right hand side operand */
 +        typename result_of::matrix_expression_internal_storage<RHS>::type _rhs;
 +    };
 +    
 +    
 +    /** @brief A tag indicating iteration along increasing row index of a matrix */
 +    struct row_iteration {};
 +    
 +    /** @brief A tag indicating iteration along increasing column index of a matrix */
 +    struct col_iteration {};
 +
 +    //STL-like iterator. TODO: STL-compliance...
 +    template <typename ROWCOL, typename MATRIXTYPE>
 +    class matrix_iterator
 +    {
 +        typedef matrix_iterator<ROWCOL, MATRIXTYPE>    self_type;
 +      public:
 +        typedef typename MATRIXTYPE::value_type       value_type;
 +        
 +        matrix_iterator(MATRIXTYPE & mat, 
 +                        std::size_t start_row,
 +                        std::size_t start_col) : mat_(mat), row_(start_row), col_(start_col) {};
 +        
 +        value_type operator*(void) { return mat_(row_, col_); }
 +        self_type & operator++(void) { viennacl::tools::MATRIX_ITERATOR_INCREMENTER<ROWCOL, MATRIXTYPE>::apply(mat_, row_, col_); return *this; }
 +        self_type operator++(int) { self_type tmp = *this; ++(*this); return tmp; }  // post-increment returns by value; returning a reference to the local tmp would be undefined behaviour
 +        
 +        bool operator==(self_type const & other) { return (row_ == other.row_) && (col_ == other.col_); }
 +        bool operator!=(self_type const & other) { return !(*this == other); }
 +        
 +        vcl_size_t index1() { return row_; }
 +        vcl_size_t index2() { return col_; }
 +        
 +        MATRIXTYPE & operator()(void) const { return mat_; }
 +      
 +      private:
 +        MATRIXTYPE & mat_;
 +        vcl_size_t row_;
 +        vcl_size_t col_;
 +    };
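 +
 +    // Illustrative usage sketch (assumes an already filled matrix m; for exposition only):
 +    //   matrix_iterator<row_iteration, matrix<float, row_major, 1> > it(m, 0, 0);
 +    //   std::cout << *it;   // entry at (it.index1(), it.index2())
 +    //   ++it;               // advance along increasing row index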
 +
 +    /** @brief A dense matrix class
 +    *
 +    * @tparam SCALARTYPE   The underlying scalar type (either float or double)
 +    * @tparam F            Storage layout: Either row_major or column_major (at present only row_major is supported)
 +    * @tparam ALIGNMENT   The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
 +    */
 +    template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
 +    class matrix
 +    {
 +      typedef matrix<SCALARTYPE, F, ALIGNMENT>          self_type;
 +    public:
 +      
 +      typedef matrix_iterator<row_iteration, matrix<SCALARTYPE, F, ALIGNMENT> >   iterator1;
 +      typedef matrix_iterator<col_iteration, matrix<SCALARTYPE, F, ALIGNMENT> >   iterator2;
 +      typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
 +      typedef vcl_size_t                                                          size_type;
 +      
 +      /** @brief The default constructor. Does not allocate any memory. */
 +      matrix() : rows_(0), columns_(0)
 +      {
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
 +        KernelClass::init();
 +      };
 +      
 +      /** @brief Creates the matrix with the given dimensions
 +      *
 +      * @param rows     Number of rows
 +      * @param columns  Number of columns
 +      */
 +      explicit matrix(size_type rows, size_type columns) :
 +        rows_(rows), columns_(columns)
 +      {
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
 +        KernelClass::init();
 +        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
 +      }
 +
 +      explicit matrix(cl_mem mem, size_type rows, size_type columns) :
 +        rows_(rows), columns_(columns)
 +      {
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
 +        KernelClass::init();
 +        elements_ = mem;
 +        elements_.inc(); //prevents the user-provided memory from being deleted once the matrix object is destroyed.
 +      }
 +
 +      template <typename LHS, typename RHS, typename OP>
 +      matrix(matrix_expression< LHS, RHS, OP> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
 +      {
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
 +        KernelClass::init();
 +        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
 +        
 +        *this = proxy;
 +      }
 +      
 +      // matrix_range
 +
 +      matrix(matrix_range<self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
 +      {
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
 +        KernelClass::init();
 +        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
 +        
 +        *this = proxy;
 +      }
 +
 +      matrix(matrix_range<const self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
 +      {
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
 +        KernelClass::init();
 +        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
 +        
 +        *this = proxy;
 +      }
 +      
 +      // matrix_slice
 +
 +      matrix(matrix_slice<self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
 +      {
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
 +        KernelClass::init();
 +        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
 +        
 +        *this = proxy;
 +      }
 +
 +      matrix(matrix_slice<const self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
 +      {
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
 +        KernelClass::init();
 +        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
 +        
 +        *this = proxy;
 +      }
 +
 +
 +      //copy constructor:
 +      matrix(const matrix<SCALARTYPE, F, ALIGNMENT> & mat) :
 +        rows_(mat.size1()), columns_(mat.size2()),
 +        elements_(viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size()))
 +      {
 +        cl_int err;
 +        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), mat.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix<SCALARTYPE, F, ALIGNMENT> & mat)
 +      {
 +        resize(mat.size1(), mat.size2(), false);
 +        cl_int err;
 +        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), mat.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        return *this;
 +      }
 +      
 +      
 +      // A = trans(B). Currently carried out via a round trip through CPU memory.
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                                                            const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                                                            op_trans> & proxy)
 +      {
 +        assert(elements_.get() != proxy.lhs().handle().get() && "Self-assignment of matrix transpose not implemented");
 +        assert(proxy.lhs().size1() == size2() && "Matrix dimensions do not match!");
 +        assert(proxy.lhs().size2() == size1() && "Matrix dimensions do not match!");
 +
 +        resize(proxy.lhs().size2(), proxy.lhs().size1(), false);
 +        
 +        std::vector<SCALARTYPE> temp(proxy.lhs().internal_size());
 +        
 +        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                         proxy.lhs().handle().get(), CL_TRUE, 0,
 +                                         sizeof(SCALARTYPE)*proxy.lhs().internal_size(),
 +                                         &(temp[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        viennacl::ocl::get_queue().finish();
 +
 +        // now transpose it
 +        std::vector<SCALARTYPE> temp_trans(internal_size());
 +
 +        for (vcl_size_t i=0; i<proxy.lhs().size1(); ++i)
 +          for (vcl_size_t j=0; j<proxy.lhs().size2(); ++j)
 +            temp_trans[F::mem_index(j,i, internal_size1(), internal_size2())] 
 +             = temp[F::mem_index(i,j, proxy.lhs().internal_size1(), proxy.lhs().internal_size2())];
 +
 +        // write back
 +        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
 +                                                                   sizeof(SCALARTYPE)*internal_size(),
 +                                                                   &(temp_trans[0]));
 +          
 +        return *this;
 +      }
 +
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_range<self_type> & mat)
 +      {
 +        resize(mat.size1(), mat.size2(), false);
 +        viennacl::linalg::assign(*this, mat);
 +        return *this;
 +      }
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_range<const self_type> & mat)
 +      {
 +        resize(mat.size1(), mat.size2(), false);
 +        viennacl::linalg::assign(*this, mat);
 +        return *this;
 +      }
 +
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_slice<self_type> & mat)
 +      {
 +        resize(mat.size1(), mat.size2(), false);
 +        viennacl::linalg::assign(*this, mat);
 +        return *this;
 +      }
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_slice<const self_type> & mat)
 +      {
 +        resize(mat.size1(), mat.size2(), false);
 +        viennacl::linalg::assign(*this, mat);
 +        return *this;
 +      }
 +
 +
 +      /** @brief Resizes the matrix.
 +      *   Existing entries can optionally be preserved; note that a new memory buffer is allocated in either case.
 +      *
 +      * @param rows       New number of rows
 +      * @param columns    New number of columns
 +      * @param preserve   If true, existing values are preserved. 
 +      */
 +      void resize(size_type rows, size_type columns, bool preserve = true)
 +      {
 +        assert(rows > 0 && columns > 0);
 +        if (preserve)
 +        {
 +          //get old entries:
 +          std::vector< SCALARTYPE > old_entries(internal_size());
 +          cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), //queue
 +                                           elements_.get(), //source buffer
 +                                           CL_TRUE, //blocking read
 +                                           0, //offset
 +                                           sizeof(SCALARTYPE)*internal_size(), //size in bytes
 +                                           &(old_entries[0]), //destination pointer on the host
 +                                           0, NULL, NULL);
 +          VIENNACL_ERR_CHECK(err);
 +          
 +          //set up entries of new matrix:
 +          std::vector< SCALARTYPE > new_entries(F::internal_size1(rows, ALIGNMENT) * F::internal_size2(columns, ALIGNMENT));
 +          for (size_type i=0; i<rows; ++i)
 +          {
 +            if (i >= rows_)
 +              continue;
 +              
 +            for (size_type j=0; j<columns; ++j)
 +            {
 +              if (j >= columns_)
 +                continue;
 +              new_entries[F::mem_index(i, j, F::internal_size1(rows, ALIGNMENT), F::internal_size2(columns, ALIGNMENT))] 
 +                 = old_entries[F::mem_index(i, j, internal_size1(), internal_size2())];
 +            }
 +          }
 +          
 +          //copy new entries to GPU:
 +          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, new_entries);
 +          rows_ = rows;
 +          columns_ = columns;
 +        }
 +        else //discard old entries:
 +        {
 +          rows_ = rows;
 +          columns_ = columns;
 +          
 +          std::vector< SCALARTYPE > new_entries(F::internal_size1(rows, ALIGNMENT) * F::internal_size2(columns, ALIGNMENT));
 +          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, new_entries);
 +        }
 +      }
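 +      
 +      // Illustrative usage sketch (sizes are made up):
 +      //   viennacl::matrix<float> m(4, 4);
 +      //   m.resize(8, 8);          // preserves the old 4x4 block
 +      //   m.resize(2, 2, false);   // discards all previous entries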
 +      
 +      
 +      /** @brief Read-write access to a single element of the matrix
 +      */
 +      entry_proxy<SCALARTYPE> operator()(size_type row_index, size_type col_index)
 +      {
 +        return entry_proxy<SCALARTYPE>(F::mem_index(row_index, col_index, internal_size1(), internal_size2()), elements_);
 +      }
 +      
 +      /** @brief Read access to a single element of the matrix
 +      */
 +      scalar<SCALARTYPE> operator()(size_type row_index, size_type col_index) const
 +      {
 +        scalar<SCALARTYPE> tmp;
 +        cl_int err;
 +        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                  elements_.get(),
 +                                  tmp.handle().get(),
 +                                  sizeof(SCALARTYPE) * F::mem_index(row_index, col_index, internal_size1(), internal_size2()),
 +                                  0,
 +                                  sizeof(SCALARTYPE),
 +                                  0,
 +                                  NULL,
 +                                  NULL);
 +        //assert(err == CL_SUCCESS);
 +        VIENNACL_ERR_CHECK(err);
 +        return tmp;
 +      }
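 +      
 +      // Illustrative usage sketch (assumes a 3x3 matrix m of floats):
 +      //   m(0, 0) = 1.0f;          // write access through the entry_proxy
 +      //   float v00 = m(0, 0);     // read access; goes through a scalar temporary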
 +      
 +
 +      matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                         const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                         op_add >
 +      operator + (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
 +      {
 +        return matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                  const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                  op_add > (*this, other);
 +      }
 +
 +      // operator +=
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
 +      {
 +        viennacl::linalg::inplace_add(*this, other);
 +        return *this;
 +      }
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix_range< matrix<SCALARTYPE, F, ALIGNMENT> > & other) 
 +      {
 +        viennacl::linalg::inplace_add(*this, other);
 +        return *this;
 +      }
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix_slice< matrix<SCALARTYPE, F, ALIGNMENT> > & other) 
 +      {
 +        viennacl::linalg::inplace_add(*this, other);
 +        return *this;
 +      }
 +
 +      template <unsigned int A1, unsigned int A2>
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix_expression< const vector<SCALARTYPE, A1>,
 +                                                                               const vector<SCALARTYPE, A2>,
 +                                                                               op_prod > & proxy) 
 +      {
 +        viennacl::linalg::rank_1_update(*this, proxy.lhs(), proxy.rhs());
 +        return *this;
 +      }
 +
 +      template <unsigned int A1, unsigned int A2>
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix_expression< const matrix_expression< const vector<SCALARTYPE, A1>,
 +                                                                                                        const vector<SCALARTYPE, A2>,
 +                                                                                                        op_prod >,
 +                                                                               const SCALARTYPE,
 +                                                                               op_prod > & proxy) 
 +      {
 +        viennacl::linalg::scaled_rank_1_update(*this, proxy.rhs(), proxy.lhs().lhs(), proxy.lhs().rhs());
 +        return *this;
 +      }
 +      
 +      // operator -
 +      matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                         const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                         op_sub >
 +      operator - (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
 +      {
 +        return matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                  const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                  op_sub > (*this, other);
 +      }
 +      
 +      // operator -=
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
 +      {
 +        viennacl::linalg::inplace_sub(*this, other);
 +        return *this;
 +      }
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix_range< matrix<SCALARTYPE, F, ALIGNMENT> > & other) 
 +      {
 +        viennacl::linalg::inplace_sub(*this, other);
 +        return *this;
 +      }
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix_slice< matrix<SCALARTYPE, F, ALIGNMENT> > & other) 
 +      {
 +        viennacl::linalg::inplace_sub(*this, other);
 +        return *this;
 +      }
 +
 +      template <unsigned int A1, unsigned int A2>
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix_expression< const vector<SCALARTYPE, A1>,
 +                                                                               const vector<SCALARTYPE, A2>,
 +                                                                               op_prod > & proxy) 
 +      {
 +        viennacl::linalg::scaled_rank_1_update(*this, static_cast<SCALARTYPE>(-1.0), proxy.lhs(), proxy.rhs());
 +        return *this;
 +      }
 +
 +
 +      template <unsigned int A1, unsigned int A2>
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix_expression< const matrix_expression< const vector<SCALARTYPE, A1>,
 +                                                                                                        const vector<SCALARTYPE, A2>,
 +                                                                                                        op_prod >,
 +                                                                               const SCALARTYPE,
 +                                                                               op_prod > & proxy) 
 +      {
 +        viennacl::linalg::scaled_rank_1_update(*this, static_cast<SCALARTYPE>(-1.0) * proxy.rhs(), proxy.lhs().lhs(), proxy.lhs().rhs());
 +        return *this;
 +      }
 +      
 +      
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator *= (SCALARTYPE val) 
 +      {
 +        viennacl::linalg::inplace_mult(*this, val);
 +        return *this;
 +      }
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator *= (scalar<SCALARTYPE> const & val) 
 +      {
 +        viennacl::linalg::inplace_mult(*this, val);
 +        return *this;
 +      }
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator /= (SCALARTYPE val) 
 +      {
 +        viennacl::linalg::inplace_mult(*this, SCALARTYPE(1.0) / val);
 +        return *this;
 +      }
 +
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator /= (scalar<SCALARTYPE> const & val) 
 +      {
 +        viennacl::linalg::inplace_divide(*this, val);
 +        return *this;
 +      }
 +
 +
 +      //this = A * B and related (with trans())
 +      template <typename MatrixType1, typename MatrixType2>
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator = (const matrix_expression< MatrixType1,
 +                                                                              MatrixType2,
 +                                                                              op_prod > & proxy) 
 +      {
 +        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +
 +      //this = A + B
 +      template <typename T1, typename T2>
 +      matrix<SCALARTYPE, F, ALIGNMENT> &
 +      operator = (const matrix_expression< const T1,
 +                                           const T2,
 +                                           op_add > & proxy) 
 +      {
 +        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +      
 +      //this = A - B
 +      template <typename T1, typename T2>
 +      matrix<SCALARTYPE, F, ALIGNMENT> &
 +      operator = (const matrix_expression< const T1,
 +                                           const T2,
 +                                           op_sub > & proxy) 
 +      {
 +        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);  //op_sub expression: dispatch to sub(), not add()
 +        return *this;
 +      }
 +      
 +      
 +      
 +
 +      //this = A - B
 +      matrix<SCALARTYPE, F, ALIGNMENT> & operator = (const matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                                                               const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                                                               op_sub > & proxy) 
 +      {
 +        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +
 +
 +      /** @brief Returns the number of rows */
 +      const size_type & size1() const { return rows_;}
 +      /** @brief Returns the number of columns */
 +      const size_type & size2() const { return columns_; }
 +      
 +      /** @brief Resets all entries to zero */
 +      void clear()
 +      {
 +        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
 +        
 +        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "clear");
 +        viennacl::ocl::enqueue(k(elements_,
 +                                 cl_uint(0), cl_uint(0),
 +                                 cl_uint(1), cl_uint(1),
 +                                 cl_uint(size1()), cl_uint(size2()),
 +                                 cl_uint(internal_size1()), cl_uint(internal_size2())
 +                                )
 +                              );
 +      }
 +      
 +      
 +      //const unsigned int row_stride() const { return roundUpToNextMultiple<unsigned int>(columns(), ALIGNMENT); }
 +      /** @brief Returns the internal number of rows. Usually required for launching OpenCL kernels only */
 +      const size_type internal_size1() const { return F::internal_size1(size1(), ALIGNMENT); }
 +      /** @brief Returns the internal number of columns. Usually required for launching OpenCL kernels only */
 +      const size_type internal_size2() const { return F::internal_size2(size2(), ALIGNMENT); }
 +      /** @brief Returns the total amount of allocated memory in multiples of sizeof(SCALARTYPE) */
 +      const size_type internal_size() const { return internal_size1() * internal_size2(); }
 +      
 +      /** @brief Returns the OpenCL handle */
 +      const viennacl::ocl::handle<cl_mem> & handle() const { return elements_; }
 +      
 +      #if defined(_MSC_VER) && _MSC_VER < 1500          //Visual Studio 2005 needs special treatment
 +      template <typename CPU_MATRIX>
 +      friend void copy(const CPU_MATRIX & cpu_matrix,
 +                      matrix & gpu_matrix );
 +      
 +      template <typename SCALARTYPE2, typename A1, typename A2>
 +      friend void copy(const std::vector< std::vector<SCALARTYPE2, A1>, A2> & cpu_matrix,
 +                      matrix & gpu_matrix );
 +      
 +      template <typename SCALARTYPE2>
 +      friend void fast_copy(SCALARTYPE2 * cpu_matrix_begin,
 +                            SCALARTYPE2 * cpu_matrix_end,
 +                            matrix & gpu_matrix);
 +      
 +      #ifdef VIENNACL_HAVE_EIGEN
 +      friend void copy(const Eigen::MatrixXf & cpu_matrix,
 +                       matrix & gpu_matrix);
 +      
 +      friend void copy(const Eigen::MatrixXd & cpu_matrix,
 +                       matrix & gpu_matrix);
 +      #endif
 +      
 +      #ifdef VIENNACL_HAVE_MTL4
 +      template <typename SCALARTYPE2, typename T>
 +      friend void copy(const mtl::dense2D<SCALARTYPE2, T>& cpu_matrix,
 +                       matrix & gpu_matrix);
 +      #endif
 +      #else
 +      template <typename CPU_MATRIX, typename SCALARTYPE2, typename F2, unsigned int ALIGNMENT2>
 +      friend void copy(const CPU_MATRIX & cpu_matrix,
 +                      matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix );
 +                      
 +      template <typename SCALARTYPE2, typename A1, typename A2, typename F2, unsigned int ALIGNMENT2>
 +      friend void copy(const std::vector< std::vector<SCALARTYPE2, A1>, A2> & cpu_matrix,
 +                       matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix );
 +      
 +      template <typename SCALARTYPE2, typename F2, unsigned int ALIGNMENT2>
 +      friend void fast_copy(SCALARTYPE2 * cpu_matrix_begin,
 +                            SCALARTYPE2 * cpu_matrix_end,
 +                            matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix);
 +      
 +      #ifdef VIENNACL_HAVE_EIGEN
 +      template <typename F2, unsigned int ALIGNMENT2>
 +      friend void copy(const Eigen::MatrixXf & cpu_matrix,
 +                matrix<float, F2, ALIGNMENT2> & gpu_matrix);
 +      
 +      template <typename F2, unsigned int ALIGNMENT2>
 +      friend void copy(const Eigen::MatrixXd & cpu_matrix,
 +                matrix<double, F2, ALIGNMENT2> & gpu_matrix);
 +      #endif
 +      
 +      #ifdef VIENNACL_HAVE_MTL4
 +      template <typename SCALARTYPE2, typename T, typename F2, unsigned int ALIGNMENT2>
 +      friend void copy(const mtl::dense2D<SCALARTYPE2, T>& cpu_matrix,
 +                       matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix);
 +      #endif
 +      #endif                 
 +      
 +    private:
 +      size_type rows_;
 +      size_type columns_;
 +      viennacl::ocl::handle<cl_mem> elements_;
 +    }; //matrix
 +
 +    /** @brief Prints the matrix. Output is compatible with boost::numeric::ublas
 +    *
 +    * @param s            STL output stream
 +    * @param gpu_matrix   A dense ViennaCL matrix
 +    */
 +    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
 +    std::ostream & operator<<(std::ostream & s, const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
 +    {
 +      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
 +      
 +      std::vector<SCALARTYPE> tmp(gpu_matrix.internal_size());
 +      cl_int err;
 +      err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE) * gpu_matrix.internal_size(), &tmp[0], 0, NULL, NULL);
 +      VIENNACL_ERR_CHECK(err);
 +      viennacl::ocl::get_queue().finish();
 +      
 +      s << "[" << gpu_matrix.size1() << "," << gpu_matrix.size2() << "]";
 +      
 +      s << "(";
 +      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
 +      {
 +        s << "(";
 +        for (size_type j = 0; j < gpu_matrix.size2(); ++j)
 +        {
 +          s << tmp[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
 +          if (j < gpu_matrix.size2() - 1)
 +            s << ",";
 +        }
 +        s << ")";
 +        if (i < gpu_matrix.size1() - 1)
 +          s << ",";
 +      }
 +      s << ")";
 +      return s;
 +    }
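 +
 +    // Illustrative usage sketch:
 +    //   viennacl::matrix<float> m(2, 2);
 +    //   m.clear();                     // zero the entries first
 +    //   std::cout << m << std::endl;   // prints "[2,2]((0,0),(0,0))"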
 +
 +    /** @brief Prints the matrix. Output is compatible with boost::numeric::ublas
 +    *
 +    * @param s            STL output stream
 +    * @param expr         A matrix expression
 +    */
 +    template<typename LHS, typename RHS, typename OP>
 +    std::ostream & operator<<(std::ostream & s, const matrix_expression<LHS, RHS, OP> & expr)
 +    {
 +      typedef typename viennacl::tools::CPU_SCALAR_TYPE_DEDUCER< typename tools::CONST_REMOVER<LHS>::ResultType >::ResultType     ScalarType;
 +
 +      matrix<ScalarType> temp = expr;
 +      s << temp;
 +      return s;
 +    }
 +    
 +    /** @brief Returns an expression template class representing a transposed matrix */
 +    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
 +    matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                       const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                       op_trans> trans(const matrix<SCALARTYPE, F, ALIGNMENT> & mat)
 +    {
 +      return matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                const matrix<SCALARTYPE, F, ALIGNMENT>,
 +                                op_trans>(mat, mat);
 +    }
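 +    
 +    // Illustrative usage sketch (dimensions swap; the assignment currently round-trips through CPU memory):
 +    //   viennacl::matrix<float> A(3, 4);
 +    //   viennacl::matrix<float> B(4, 3);
 +    //   B = viennacl::trans(A);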
 +    
 +    
 +    /////////////////////// transfer operations: //////////////////////////////////////
 +
 +    //
 +    //cpu to gpu, generic type:
 +    //
 +    /** @brief Copies a dense matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
 +    *
 +    * @param cpu_matrix   A dense matrix on the host. Type requirements: .size1() returns number of rows, .size2() returns number of columns. Access to entries via operator()
 +    * @param gpu_matrix   A dense ViennaCL matrix
 +    */
 +    template <typename CPU_MATRIX, typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
 +    void copy(const CPU_MATRIX & cpu_matrix,
 +              matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix )
 +    {
 +      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
 +      
 +      //std::cout << "Copying CPU_MATRIX!" << std::endl;
 +      //std::cout << "Size at begin: " << gpu_matrix.size1() << ", " << gpu_matrix.size2() << std::endl;
 +      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
 +      {
 +        gpu_matrix.resize(cpu_matrix.size1(),
 +                          cpu_matrix.size2(), false);
 +      }
 +      else
 +      {
 +        assert( (gpu_matrix.size1() == cpu_matrix.size1()) 
 +               && (gpu_matrix.size2() == cpu_matrix.size2())
 +              );
 +      }
 +
 +      std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
 +      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
 +      {
 +        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
 +          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
 +      }
 +      
 +      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
 +      //std::cout << "Size at end: " << gpu_matrix.size1() << ", " << gpu_matrix.size2() << std::endl;
 +    }
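 +    
 +    // Illustrative usage sketch (any host type exposing size1(), size2() and operator(i,j)
 +    // works here; Boost.uBLAS is just one example and is not required by the library):
 +    //   boost::numeric::ublas::matrix<float> cpu_m(64, 64);
 +    //   viennacl::matrix<float> gpu_m(64, 64);
 +    //   viennacl::copy(cpu_m, gpu_m);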
 +    
 +    //
 +    //cpu to gpu, STL type:
 +    //
 +    /** @brief Copies a dense STL-type matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
 +    *
 +    * @param cpu_matrix   A dense matrix on the host of type std::vector< std::vector<> >. cpu_matrix[i][j] returns the element in the i-th row and j-th column (both starting with zero)
 +    * @param gpu_matrix   A dense ViennaCL matrix
 +    */
 +    template <typename SCALARTYPE, typename A1, typename A2, typename F, unsigned int ALIGNMENT>
 +    void copy(const std::vector< std::vector<SCALARTYPE, A1>, A2> & cpu_matrix,
 +              matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix )
 +    {
 +      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
 +      
 +      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
 +      {
 +        gpu_matrix.resize(cpu_matrix.size(),
 +                          cpu_matrix[0].size(),
 +                          false);
 +      }
 +      else
 +      {
 +        assert( (gpu_matrix.size1() == cpu_matrix.size()) 
 +               && (gpu_matrix.size2() == cpu_matrix[0].size())
 +              );
 +      }
 +
 +      std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
 +      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
 +      {
 +        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
 +          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix[i][j];
 +      }
 +      
 +      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
 +    }
 +    
 +    
 +    //
 +    //cpu to gpu, another STL type:
 +    //
 +    /** @brief Copies a dense matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU) without a temporary. The matrix layout on the CPU must match the matrix layout on the GPU.
 +    *
 +    * @param cpu_matrix_begin   Pointer to the first matrix entry. Cf. iterator concept in STL
 +    * @param cpu_matrix_end     Pointer past the last matrix entry. Cf. iterator concept in STL
 +    * @param gpu_matrix         A dense ViennaCL matrix
 +    */
 +    template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
 +    void fast_copy(SCALARTYPE * cpu_matrix_begin,
 +                   SCALARTYPE * cpu_matrix_end,
 +                   matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
 +    {
 +      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
 +                                                                            sizeof(SCALARTYPE) * (cpu_matrix_end - cpu_matrix_begin),
 +                                                                            cpu_matrix_begin);
 +    }
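 +    
 +    // Illustrative usage sketch (the host buffer must already use the padded GPU layout,
 +    // hence internal_size() rather than size1()*size2()):
 +    //   viennacl::matrix<float> m(4, 4);
 +    //   std::vector<float> host(m.internal_size());
 +    //   viennacl::fast_copy(&host[0], &host[0] + host.size(), m);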
 +    
 +   
 +    #ifdef VIENNACL_HAVE_EIGEN
 +    /** @brief Copies a dense Eigen matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
 +    *
 +    * @param cpu_matrix   A dense Eigen matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th column (both starting with zero)
 +    * @param gpu_matrix   A dense ViennaCL matrix
 +    */
 +    template <typename F, unsigned int ALIGNMENT>
 +    void copy(const Eigen::MatrixXf & cpu_matrix,
 +              matrix<float, F, ALIGNMENT> & gpu_matrix)
 +    {
 +      typedef typename matrix<float, F, ALIGNMENT>::size_type      size_type;
 +      
 +      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
 +      {
 +        gpu_matrix.resize(cpu_matrix.rows(),
 +                          cpu_matrix.cols(),
 +                          false);
 +      }
 +      else
 +      {
 +        assert( (gpu_matrix.size1() == static_cast<std::size_t>(cpu_matrix.rows())) 
 +               && (gpu_matrix.size2() == static_cast<std::size_t>(cpu_matrix.cols()))
 +              );
 +      }
 +
 +      std::vector<float> data(gpu_matrix.internal_size());
 +      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
 +      {
 +        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
 +          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
 +      }
 +      
 +      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
 +    }
 +    
 +    /** @brief Copies a dense Eigen matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
 +    *
 +    * @param cpu_matrix   A dense Eigen matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th column (both starting with zero)
 +    * @param gpu_matrix   A dense ViennaCL matrix
 +    */
 +    template <typename F, unsigned int ALIGNMENT>
 +    void copy(const Eigen::MatrixXd & cpu_matrix,
 +              matrix<double, F, ALIGNMENT> & gpu_matrix)
 +    {
 +      typedef typename matrix<double, F, ALIGNMENT>::size_type      size_type;
 +      
 +      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
 +      {
 +        gpu_matrix.resize(cpu_matrix.rows(),
 +                          cpu_matrix.cols(),
 +                          false);
 +      }
 +      else
 +      {
 +        assert( (gpu_matrix.size1() == static_cast<std::size_t>(cpu_matrix.rows())) 
 +               && (gpu_matrix.size2() == static_cast<std::size_t>(cpu_matrix.cols()))
 +              );
 +      }
 +
 +      std::vector<double> data(gpu_matrix.internal_size());
 +      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
 +      {
 +        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
 +          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
 +      }
 +      
 +      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
 +    }
 +    #endif
 +    
 +    #ifdef VIENNACL_HAVE_MTL4
 +    /** @brief Copies a dense MTL matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
 +    *
 +    * @param cpu_matrix   A dense MTL matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th column (both starting with zero)
 +    * @param gpu_matrix   A dense ViennaCL matrix
 +    */
 +    template <typename SCALARTYPE, typename T, typename F, unsigned int ALIGNMENT>
 +    void copy(const mtl::dense2D<SCALARTYPE, T>& cpu_matrix,
 +              matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
 +    {
 +      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
 +      
 +      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
 +      {
 +        gpu_matrix.resize(cpu_matrix.num_rows(),
 +                          cpu_matrix.num_cols(),
 +                          false);
 +      }
 +      else
 +      {
 +        assert( (gpu_matrix.size1() == cpu_matrix.num_rows()) 
 +               && (gpu_matrix.size2() == cpu_matrix.num_cols())
 +              );
 +      }
 +
 +      std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
 +      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
 +      {
 +        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
 +          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix[i][j];
 +      }
 +      
 +      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
 +    }
 +    #endif
 +    
 +    
 +    
 +    
 +    //
 +    //gpu to cpu, generic type
 +    //
 +    /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU). 
 +    *
 +    * @param gpu_matrix   A dense ViennaCL matrix
 +    * @param cpu_matrix   A dense matrix on the host. Must have at least as many rows and columns as the gpu_matrix! Type requirement: access to entries via operator()
 +    */
 +    template <typename CPU_MATRIX, typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
 +    void copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
 +              CPU_MATRIX & cpu_matrix )
 +    {
 +      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
 +      
 +      if ( (gpu_matrix.size1() > 0) && (gpu_matrix.size2() > 0) )
 +      {
 +        std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
 +        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        
 +        //now copy entries to cpu_matrix:
 +        for (size_type i = 0; i < gpu_matrix.size1(); ++i)
 +          for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
 +            cpu_matrix(i,j) = temp_buffer[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
 +      }
 +    }
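 +
 +    // Device-to-host sketch for the generic overload above (assuming a
 +    // Boost.uBLAS matrix; any type with operator()(i, j) and sufficient
 +    // dimensions works):
 +    //
 +    //   boost::numeric::ublas::matrix<float> ublas_A(vcl_A.size1(), vcl_A.size2());
 +    //   viennacl::copy(vcl_A, ublas_A);   // device -> host transfer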
 +
 +    //gpu to cpu, STL type
 +    /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU). 
 +    *
 +    * @param gpu_matrix   A dense ViennaCL matrix
 +    * @param cpu_matrix   A dense matrix on the host using STL types, typically std::vector< std::vector<> >. Must have at least as many rows and columns as the gpu_matrix! Type requirement: access to entries via cpu_matrix[i][j]
 +    */
 +    template <typename SCALARTYPE, typename A1, typename A2, typename F, unsigned int ALIGNMENT>
 +    void copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
 +              std::vector< std::vector<SCALARTYPE, A1>, A2> & cpu_matrix)
 +    {
 +      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
 +      
 +      if ( (gpu_matrix.size1() > 0) && (gpu_matrix.size2() > 0) 
 +         && (cpu_matrix.size() >= gpu_matrix.size1()) && (cpu_matrix[0].size() >= gpu_matrix.size2()))
 +      {
 +        std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
 +        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        
 +        //now copy entries to cpu_matrix:
 +        for (size_type i = 0; i < gpu_matrix.size1(); ++i)
 +          for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
 +            cpu_matrix[i][j] = temp_buffer[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
 +      }
 +    }
 +
 +    //gpu to cpu, STL type
 +    /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU). 
 +    *
 +    * @param gpu_matrix         A dense ViennaCL matrix
 +    * @param cpu_matrix_begin   Pointer to the output memory on the CPU. User must ensure that provided memory is large enough.
 +    */
 +    template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
 +    void fast_copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
 +                   SCALARTYPE * cpu_matrix_begin)
 +    {
 +      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                       gpu_matrix.handle().get(), 
 +                                       CL_TRUE, 0,
 +                                       sizeof(SCALARTYPE)*gpu_matrix.internal_size(),
 +                                       cpu_matrix_begin, 0, NULL, NULL);
 +      VIENNACL_ERR_CHECK(err);
 +    }
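 +
 +    // Device-to-host sketch for fast_copy(); the caller must provide at least
 +    // internal_size() entries, since the padding is transferred as well:
 +    //
 +    //   std::vector<float> host_buf(vcl_A.internal_size());
 +    //   viennacl::fast_copy(vcl_A, &host_buf[0]);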
 +
 +
 +
 +
 +
 +
 +
 +
 +
 +    // outer_prod(v1, v2) * val;
 +    template<typename CPU_SCALAR, typename SCALARTYPE,unsigned int VECTOR_ALIGNMENT>
 +    viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
 +                                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
 +                                                                    op_prod>,
 +                                 const SCALARTYPE,
 +                                 op_prod>  operator*(const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
 +                                                                                        const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
 +                                                                                        op_prod> & proxy,
 +                                                     CPU_SCALAR const & val)
 +    {
 +      return viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
 +                                                                             const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
 +                                                                             op_prod>,
 +                                          const SCALARTYPE,
 +                                          op_prod>(proxy, static_cast<SCALARTYPE>(val));
 +    }
 +
 +    // val * outer_prod(v1, v2);
 +    template <typename CPU_SCALAR, typename SCALARTYPE, unsigned int VA1, unsigned int VA2>
 +    viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
 +                                                                    const viennacl::vector<SCALARTYPE, VA2>,
 +                                                                    op_prod>,
 +                                 const SCALARTYPE,
 +                                 op_prod>  operator*(CPU_SCALAR const & val,
 +                                                     viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
 +                                                                                  const viennacl::vector<SCALARTYPE, VA2>,
 +                                                                                  op_prod> const & proxy)
 +    {
 +      return viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
 +                                                                             const viennacl::vector<SCALARTYPE, VA2>,
 +                                                                             op_prod>,
 +                                          const SCALARTYPE,
 +                                          op_prod>(proxy, static_cast<SCALARTYPE>(val));
 +    }
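 +
 +    // Illustrative use of the scaled outer products above (assuming vcl_v1 and
 +    // vcl_v2 are initialized viennacl::vector<float> objects):
 +    //
 +    //   viennacl::matrix<float> vcl_A = 2.0f * viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
 +    //   vcl_A = viennacl::linalg::outer_prod(vcl_v1, vcl_v2) * 0.5f;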
 +    
 +   
 +
 +} //namespace viennacl
 +
 +#endif
++=======
+ #ifndef VIENNACL_MATRIX_HPP_
+ #define VIENNACL_MATRIX_HPP_
+ 
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file viennacl/matrix.hpp
+     @brief Implementation of the dense matrix class
+ */
+ 
+ #include "viennacl/forwards.h"
+ #include "viennacl/scalar.hpp"
+ #include "viennacl/vector.hpp"
+ #include "viennacl/linalg/matrix_operations.hpp"
+ #include "viennacl/linalg/sparse_matrix_operations.hpp"
+ #include "viennacl/tools/tools.hpp"
+ #include "viennacl/tools/matrix_size_deducer.hpp"
+ #include "viennacl/meta/result_of.hpp"
+ #include "viennacl/meta/enable_if.hpp"
+ //#include "viennacl/rand/utils.hpp"
+ #include "viennacl/traits/handle.hpp"
+ 
+ namespace viennacl
+ {
+   /** @brief Base class for representing matrices where the individual entries are not all stored explicitly, e.g. identity_matrix<>
+     *
+     * Examples are identity_matrix, scalar_matrix, and zero_matrix.
+     */
+   template<typename SCALARTYPE>
+   class implicit_matrix_base
+   {
+     protected:
+       typedef vcl_size_t        size_type;
+       implicit_matrix_base(size_type size1, size_type size2, std::pair<SCALARTYPE, bool> value, bool diag) : size1_(size1), size2_(size2), value_(value), diag_(diag){ }
+     public:
+       typedef SCALARTYPE const & const_reference;
+       typedef SCALARTYPE cpu_value_type;
+ 
+       size_type size1() const { return size1_; }
+       size_type size2() const { return size2_; }
+ 
+       SCALARTYPE  value() const { return value_.first; }
+       bool is_value_static( ) const { return value_.second; }
+       bool diag() const { return diag_; }
+ 
+       const_reference operator()(size_type i, size_type j) const {
+         if(diag_) return (i == j) ? value_.first : 0;
+         return value_.first;
+       }
+ 
+     protected:
+       size_type size1_;
+       size_type size2_;
+       std::pair<SCALARTYPE, bool> value_;
+       bool diag_;
+   };
+ 
+   //
+   // Initializer types
+   //
+   /** @brief Represents an identity matrix with ones on the diagonal and zeros elsewhere. To be used as an initializer for viennacl::matrix, matrix_range, or matrix_slice only. */
+   template <typename SCALARTYPE>
+   class identity_matrix
+   {
+     public:
+       typedef vcl_size_t         size_type;
+       typedef SCALARTYPE const & const_reference;
+ 
+       identity_matrix(size_type s, viennacl::context ctx = viennacl::context()) : size_(s), diag_(1), off_diag_(0), ctx_(ctx) {}
+ 
+       size_type size1() const { return size_; }
+       size_type size2() const { return size_; }
+       const_reference operator()(size_type i, size_type j) const { return (i == j) ? diag_ : off_diag_; }
+ 
+       viennacl::context context() const { return ctx_; }
+ 
+     private:
+       size_type size_;
+       SCALARTYPE diag_;
+       SCALARTYPE off_diag_;
+       viennacl::context ctx_;
+   };
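+ 
+   // Initializer sketch (assuming double precision and the default context):
+   //
+   //   viennacl::identity_matrix<double> eye(4);
+   //   viennacl::matrix<double> vcl_A = eye;   // 4 x 4 matrix with unit diagonal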
+ 
+ 
+   /** @brief Represents a matrix consisting of zeros only. To be used as an initializer for viennacl::matrix, matrix_range, or matrix_slice only. */
+   template <typename SCALARTYPE>
+   class zero_matrix
+   {
+     public:
+       typedef vcl_size_t         size_type;
+       typedef SCALARTYPE const & const_reference;
+ 
+       zero_matrix(size_type s1, size_type s2, viennacl::context ctx = viennacl::context()) : size1_(s1), size2_(s2), val_(0), ctx_(ctx) {}
+ 
+       size_type size1() const { return size1_; }
+       size_type size2() const { return size2_; }
+       const_reference operator()(size_type /*i*/, size_type /*j*/) const { return val_; }
+ 
+       viennacl::context context() const { return ctx_; }
+ 
+     private:
+       size_type size1_;
+       size_type size2_;
+       SCALARTYPE val_;
+       viennacl::context ctx_;
+   };
+ 
+ 
+   /** @brief Represents a matrix consisting of the scalar 's' only, i.e. m(i,j) = s for all i, j. To be used as an initializer for viennacl::matrix, matrix_range, or matrix_slice only. */
+   template <typename SCALARTYPE>
+   class scalar_matrix
+   {
+     public:
+       typedef vcl_size_t         size_type;
+       typedef SCALARTYPE const & const_reference;
+ 
+       scalar_matrix(size_type s1, size_type s2, const_reference val, viennacl::context ctx = viennacl::context()) : size1_(s1), size2_(s2), value_(val), ctx_(ctx) {}
+ 
+       size_type size1() const { return size1_; }
+       size_type size2() const { return size2_; }
+       const_reference operator()(size_type /*i*/, size_type /*j*/) const { return value_; }
+ 
+       viennacl::context context() const { return ctx_; }
+ 
+     private:
+       size_type size1_;
+       size_type size2_;
+       SCALARTYPE value_;
+       viennacl::context ctx_;
+   };
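+ 
+   // Analogous sketch for the scalar initializer (values are hypothetical):
+   //
+   //   viennacl::scalar_matrix<double> twos(4, 6, 2.0);
+   //   viennacl::matrix<double> vcl_B = twos;   // 4 x 6 matrix, all entries 2.0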
+ 
+ 
+ 
+ //#ifdef VIENNACL_WITH_OPENCL
+ //  template<class SCALARTYPE, class DISTRIBUTION>
+ //  rand::random_matrix_t<SCALARTYPE, DISTRIBUTION> random_matrix(unsigned int size1, unsigned int size2, DISTRIBUTION const & distribution){
+ //      return rand::random_matrix_t<SCALARTYPE,DISTRIBUTION>(size1,size2,distribution);
+ //  }
+ //#endif
+ 
+   /** @brief Expression template class for representing a tree of expressions which ultimately result in a matrix.
+     *
+     * @tparam LHS   The left hand side of the expression tree
+     * @tparam RHS   The right hand side of the expression tree
+     * @tparam OP    The operator to apply to LHS and RHS to obtain the result.
+     */
+   template <typename LHS, typename RHS, typename OP>
+   class matrix_expression
+   {
+       typedef typename viennacl::result_of::reference_if_nonscalar<LHS>::type     lhs_reference_type;
+       typedef typename viennacl::result_of::reference_if_nonscalar<RHS>::type     rhs_reference_type;
+ 
+     public:
+       typedef vcl_size_t       size_type;
+ 
+       matrix_expression(LHS & lhs, RHS & rhs) : lhs_(lhs), rhs_(rhs) {}
+ 
+       /** @brief Get left hand side operand
+       */
+       LHS & lhs() const { return lhs_; }
+       /** @brief Get right hand side operand
+       */
+       RHS & rhs() const { return rhs_; }
+ 
+       /** @brief Returns the number of rows of the resulting matrix */
+       vcl_size_t size1() const { return viennacl::tools::MATRIX_SIZE_DEDUCER<LHS, RHS, OP>::size1(lhs_, rhs_); }
+       /** @brief Returns the number of columns of the resulting matrix */
+       vcl_size_t size2() const { return viennacl::tools::MATRIX_SIZE_DEDUCER<LHS, RHS, OP>::size2(lhs_, rhs_); }
+ 
+     private:
+       /** @brief The left hand side operand */
+       lhs_reference_type lhs_;
+       /** @brief The right hand side operand */
+       rhs_reference_type rhs_;
+   };
+ 
+ 
+   /** @brief A tag indicating iteration along increasing row index of a matrix */
+   struct row_iteration {};
+ 
+   /** @brief A tag indicating iteration along increasing column index of a matrix */
+   struct col_iteration {};
+ 
+   //STL-like iterator. TODO: STL-compliance...
+   /** @brief uBLAS-like iterator class for iterating over the entries of a dense matrix. */
+   template <typename ROWCOL, typename MATRIXTYPE>
+   class matrix_iterator
+   {
+       typedef matrix_iterator<ROWCOL, MATRIXTYPE>    self_type;
+     public:
+       typedef typename MATRIXTYPE::value_type       value_type;
+ 
+       matrix_iterator(MATRIXTYPE & mat,
+                       vcl_size_t start_row,
+                       vcl_size_t start_col) : mat_(mat), row_(start_row), col_(start_col) {}
+ 
+       value_type operator*(void) { return mat_(row_, col_); }
+       self_type & operator++(void) { viennacl::tools::MATRIX_ITERATOR_INCREMENTER<ROWCOL, MATRIXTYPE>::apply(mat_, row_, col_); return *this; }
+       self_type operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
+ 
+       bool operator==(self_type const & other) { return (row_ == other.row_) && (col_ == other.col_); }
+       bool operator!=(self_type const & other) { return !(*this == other); }
+ 
+       vcl_size_t index1() { return row_; }
+       vcl_size_t index2() { return col_; }
+ 
+       MATRIXTYPE & operator()(void) const { return mat_; }
+ 
+     private:
+       MATRIXTYPE & mat_;
+       vcl_size_t row_;
+       vcl_size_t col_;
+   };
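+ 
+   // Iteration sketch (assuming vcl_A is an initialized viennacl::matrix<double>;
+   // each dereference touches device memory, so this is for convenience only):
+   //
+   //   typedef viennacl::matrix<double> MatrixType;
+   //   viennacl::matrix_iterator<viennacl::row_iteration, MatrixType> it(vcl_A, 0, 0);
+   //   std::cout << *it << std::endl;   // prints vcl_A(0, 0)
+   //   ++it;                            // advances along increasing row index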
+ 
+ 
+   /** @brief Common base class for dense matrices, matrix ranges, and matrix slices
+   *
+   * @tparam SCALARTYPE    The underlying scalar type (either float or double)
+   * @tparam F             Storage layout: either row_major or column_major
+   * @tparam SizeType      Type used for indices and sizes (see forwards.h for the default type)
+   * @tparam DistanceType  Type used for strides and index differences (see forwards.h for the default type)
+   */
+   template <class SCALARTYPE, typename F, typename SizeType /* see forwards.h for default type */, typename DistanceType /* see forwards.h for default type */>
+   class matrix_base
+   {
+       typedef matrix_base<SCALARTYPE, F, SizeType, DistanceType>          self_type;
+     public:
+ 
+       typedef matrix_iterator<row_iteration, self_type >   iterator1;
+       typedef matrix_iterator<col_iteration, self_type >   iterator2;
+       typedef scalar<SCALARTYPE>                                                  value_type;
+       typedef SCALARTYPE                                                          cpu_value_type;
+       typedef SizeType                                                            size_type;
+       typedef DistanceType                                                        difference_type;
+       typedef viennacl::backend::mem_handle                                       handle_type;
+       typedef F                                                                   orientation_functor;
+       typedef typename F::orientation_category                                    orientation_category;
+ 
+       static const size_type alignment = 128;
+ 
+ 
+       /** @brief The default constructor. Does not allocate any memory. */
+       explicit matrix_base() : size1_(0), size2_(0), start1_(0), start2_(0), stride1_(1), stride2_(1), internal_size1_(0), internal_size2_(0) {}
+ 
+       /** @brief Creates the matrix with the given dimensions
+       *
+       * @param rows     Number of rows
+       * @param columns  Number of columns
+       * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+       */
+       explicit matrix_base(size_type rows, size_type columns, viennacl::context ctx = viennacl::context())
+           : size1_(rows), size2_(columns), start1_(0), start2_(0), stride1_(1), stride2_(1),
+             internal_size1_(viennacl::tools::align_to_multiple<size_type>(rows, alignment)),
+             internal_size2_(viennacl::tools::align_to_multiple<size_type>(columns, alignment))
+       {
+         if (rows > 0 && columns > 0)
+         {
+           viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), ctx);
+           clear();
+         }
+       }
+ 
+ 
+       /** @brief Constructor for creating a matrix_range or matrix_stride from some other matrix/matrix_range/matrix_stride */
+       explicit matrix_base(viennacl::backend::mem_handle & h,
+                            size_type mat_size1, size_type mat_start1, difference_type mat_stride1, size_type mat_internal_size1,
+                            size_type mat_size2, size_type mat_start2, difference_type mat_stride2, size_type mat_internal_size2)
+         : size1_(mat_size1), size2_(mat_size2),
+           start1_(mat_start1), start2_(mat_start2),
+           stride1_(mat_stride1), stride2_(mat_stride2),
+           internal_size1_(mat_internal_size1), internal_size2_(mat_internal_size2),
+           elements_(h) {}
+ 
+       template <typename LHS, typename RHS, typename OP>
+       explicit matrix_base(matrix_expression<const LHS, const RHS, OP> const & proxy) :
+         size1_(viennacl::traits::size1(proxy)), size2_(viennacl::traits::size2(proxy)), start1_(0), start2_(0), stride1_(1), stride2_(1),
+         internal_size1_(viennacl::tools::align_to_multiple<size_type>(size1_, alignment)),
+         internal_size2_(viennacl::tools::align_to_multiple<size_type>(size2_, alignment))
+       {
+         elements_.switch_active_handle_id(viennacl::traits::active_handle_id(proxy));
+         if (internal_size() > 0)
+         {
+           viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(proxy));
+           clear();
+           self_type::operator=(proxy);
+         }
+       }
+ 
+       // CUDA or host memory:
+       explicit matrix_base(SCALARTYPE * ptr_to_mem, viennacl::memory_types mem_type,
+                            size_type mat_size1, size_type mat_start1, difference_type mat_stride1, size_type mat_internal_size1,
+                            size_type mat_size2, size_type mat_start2, difference_type mat_stride2, size_type mat_internal_size2)
+         : size1_(mat_size1), size2_(mat_size2),
+           start1_(mat_start1), start2_(mat_start2),
+           stride1_(mat_stride1), stride2_(mat_stride2),
+           internal_size1_(mat_internal_size1), internal_size2_(mat_internal_size2)
+       {
+         if (mem_type == viennacl::CUDA_MEMORY)
+         {
+ #ifdef VIENNACL_WITH_CUDA
+           elements_.switch_active_handle_id(viennacl::CUDA_MEMORY);
+           elements_.cuda_handle().reset(reinterpret_cast<char*>(ptr_to_mem));
+           elements_.cuda_handle().inc(); // prevents the user-provided memory from being deleted once the matrix object is destroyed
+ #else
+           throw cuda_not_available_exception();
+ #endif
+         }
+         else if (mem_type == viennacl::MAIN_MEMORY)
+         {
+           elements_.switch_active_handle_id(viennacl::MAIN_MEMORY);
+           elements_.ram_handle().reset(reinterpret_cast<char*>(ptr_to_mem));
+           elements_.ram_handle().inc(); // prevents the user-provided memory from being deleted once the matrix object is destroyed
+         }
+ 
+         elements_.raw_size(sizeof(SCALARTYPE) * internal_size());
+       }
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+       explicit matrix_base(cl_mem mem, size_type rows, size_type columns, viennacl::context ctx = viennacl::context())
+         : size1_(rows), size2_(columns),
+           start1_(0), start2_(0),
+           stride1_(1), stride2_(1),
+           internal_size1_(rows), internal_size2_(columns)
+       {
+         elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+         elements_.opencl_handle() = mem;
+         elements_.opencl_handle().inc();  // prevents the user-provided memory from being deleted once the matrix object is destroyed
+         elements_.opencl_handle().context(ctx.opencl_context());
+         elements_.raw_size(sizeof(SCALARTYPE)*internal_size());
+       }
+ 
+       explicit matrix_base(cl_mem mem, viennacl::context ctx,
+                            size_type mat_size1, size_type mat_start1, difference_type mat_stride1, size_type mat_internal_size1,
+                            size_type mat_size2, size_type mat_start2, difference_type mat_stride2, size_type mat_internal_size2)
+         : size1_(mat_size1), size2_(mat_size2),
+           start1_(mat_start1), start2_(mat_start2),
+           stride1_(mat_stride1), stride2_(mat_stride2),
+           internal_size1_(mat_internal_size1), internal_size2_(mat_internal_size2)
+       {
+         elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+         elements_.opencl_handle() = mem;
+         elements_.opencl_handle().inc();  // prevents the user-provided memory from being deleted once the matrix object is destroyed
+         elements_.opencl_handle().context(ctx.opencl_context());
+         elements_.raw_size(sizeof(SCALARTYPE)*internal_size());
+       }
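+ 
+       // Sketch of wrapping a user-provided cl_mem buffer 'my_buffer'
+       // (hypothetical handle; ownership stays with the user, since the
+       // reference count is incremented above):
+       //
+       //   viennacl::matrix_base<float, viennacl::row_major> wrapped(my_buffer, 64, 64);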
+ #endif
+ 
+ 
+       self_type & operator=(const self_type & other)  //enables implicit conversions
+       {
+         if (internal_size() == 0)
+         {
+           if (other.internal_size() == 0)
+             return *this;
+           resize(other.size1(), other.size2(), false);
+         }
+ 
+         viennacl::linalg::am(*this,
+                              other, cpu_value_type(1.0), 1, false, false);
+         return *this;
+       }
+ 
+       /** @brief Creates the matrix from the supplied random matrix. */
+       /*template<class DISTRIBUTION>
+       matrix(rand::random_matrix_t<SCALARTYPE, DISTRIBUTION> const & m) : rows_(m.size1), columns_(m.size2)
+       {
+         if (internal_size() > 0)
+         {
+           viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size());
+           rand::buffer_dumper<SCALARTYPE, DISTRIBUTION>::dump(elements_,m.distribution,0,internal_size());
+         }
+       }*/
+ 
+ 
+ 
+       /** @brief Assigns a matrix expression to the matrix, e.g. m1 = m2 @ alpha, where @ denotes either multiplication or division and alpha is either a CPU or a GPU scalar
+       *
+       * @param proxy  An expression template proxy class.
+       */
+       template <typename LHS, typename RHS, typename OP>
+       self_type & operator=(const matrix_expression<const LHS, const RHS, OP> & proxy)
+       {
+         assert(  (viennacl::traits::size1(proxy) == size1() || size1() == 0)
+               && (viennacl::traits::size2(proxy) == size2() || size2() == 0)
+               && bool("Incompatible matrix sizes!"));
+ 
+         if (internal_size() == 0 && viennacl::traits::size1(proxy) > 0 && viennacl::traits::size2(proxy) > 0)
+         {
+           size1_ = viennacl::traits::size1(proxy);
+           size2_ = viennacl::traits::size2(proxy);
+           internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
+           internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
+           viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(proxy));
+           if (size1_ != internal_size1_ || size2_ != internal_size2_)
+             clear();
+         }
+ 
+         if (internal_size() > 0)
+           linalg::detail::op_executor<self_type, op_assign, matrix_expression<const LHS, const RHS, OP> >::apply(*this, proxy);
+ 
+         return *this;
+       }
+ 
+ 
+       // A = trans(B). Currently achieved in CPU memory
+       self_type & operator=(const matrix_expression< const self_type,
+                                                      const self_type,
+                                                      op_trans> & proxy)
+       {
+         assert( (handle() != proxy.lhs().handle()) && bool("Self-assignment of matrix transpose not implemented"));
+         assert( ( (proxy.lhs().size1() == size2()) || (size2() == 0) ) && bool("Matrix dimensions do not match!"));
+         assert( ( (proxy.lhs().size2() == size1()) || (size1() == 0) ) && bool("Matrix dimensions do not match!"));
+ 
+         if (internal_size() == 0 && viennacl::traits::size1(proxy) > 0 && viennacl::traits::size2(proxy) > 0)
+         {
+           size1_ = viennacl::traits::size1(proxy);
+           size2_ = viennacl::traits::size2(proxy);
+           internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
+           internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
+         }
+ 
+         std::vector<SCALARTYPE> temp(proxy.lhs().internal_size());
+ 
+         viennacl::backend::memory_read(proxy.lhs().handle(), 0, sizeof(SCALARTYPE)*proxy.lhs().internal_size(), &(temp[0]));
+ 
+         // now transpose it
+         std::vector<SCALARTYPE> temp_trans(internal_size());
+ 
+         for (vcl_size_t i=0; i<proxy.lhs().size1(); ++i)
+           for (vcl_size_t j=0; j<proxy.lhs().size2(); ++j)
+             temp_trans[F::mem_index(start2() + stride2() * j,
+                                     start1() + stride1() * i,
+                                     internal_size1(), internal_size2())]
+               = temp[F::mem_index(proxy.lhs().start1() + proxy.lhs().stride1() * i,
+                                   proxy.lhs().start2() + proxy.lhs().stride2() * j,
+                                   proxy.lhs().internal_size1(), proxy.lhs().internal_size2())];
+ 
+         // write back
+         viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(proxy), &(temp_trans[0]));
+ 
+         return *this;
+       }
+ 
+       template <typename LHS, typename RHS, typename OP>
+       self_type & operator+=(const matrix_expression<const LHS, const RHS, OP> & proxy)
+       {
+         assert(  (viennacl::traits::size1(proxy) == size1())
+               && (viennacl::traits::size2(proxy) == size2())
+               && bool("Incompatible matrix sizes!"));
+         assert( (size1() > 0) && bool("Vector not yet initialized!") );
+         assert( (size2() > 0) && bool("Vector not yet initialized!") );
+ 
+         linalg::detail::op_executor<self_type, op_inplace_add, matrix_expression<const LHS, const RHS, OP> >::apply(*this, proxy);
+ 
+         return *this;
+       }
+ 
+       template <typename LHS, typename RHS, typename OP>
+       self_type & operator-=(const matrix_expression<const LHS, const RHS, OP> & proxy)
+       {
+         assert(  (viennacl::traits::size1(proxy) == size1())
+               && (viennacl::traits::size2(proxy) == size2())
+               && bool("Incompatible matrix sizes!"));
+         assert( (size1() > 0) && bool("Vector not yet initialized!") );
+         assert( (size2() > 0) && bool("Vector not yet initialized!") );
+ 
+         linalg::detail::op_executor<self_type, op_inplace_sub, matrix_expression<const LHS, const RHS, OP> >::apply(*this, proxy);
+ 
+         return *this;
+       }
+ 
+       /** @brief Assigns the supplied identity matrix to the matrix. */
+       self_type & operator = (identity_matrix<SCALARTYPE> const & m)
+       {
+         assert( (m.size1() == size1_ || size1_ == 0) && bool("Size mismatch!") );
+         assert( (m.size2() == size2_ || size2_ == 0) && bool("Size mismatch!") );
+ 
+         if (internal_size() == 0)
+         {
+           size1_ = m.size1();
+           size2_ = m.size2();
+           internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
+           internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
+           if (internal_size() > 0)
+           {
+             viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), m.context());
+             clear();
+           }
+         }
+         else
+           viennacl::linalg::matrix_assign(*this, SCALARTYPE(0));
+ 
+         if (internal_size() > 0)
+           viennacl::linalg::matrix_diagonal_assign(*this, m(0,0));
+ 
+         return *this;
+       }
+ 
+       /** @brief Assigns the supplied zero matrix to the matrix. */
+       self_type & operator = (zero_matrix<SCALARTYPE> const & m)
+       {
+         assert( (m.size1() == size1_ || size1_ == 0) && bool("Size mismatch!") );
+         assert( (m.size2() == size2_ || size2_ == 0) && bool("Size mismatch!") );
+ 
+         if (internal_size() == 0)
+         {
+           size1_ = m.size1();
+           size2_ = m.size2();
+           internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
+           internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
+           if (internal_size() > 0)
+           {
+             viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), m.context());
+             clear();
+           }
+         }
+         else
+           viennacl::linalg::matrix_assign(*this, SCALARTYPE(0));
+ 
+         return *this;
+       }
+ 
+       /** @brief Assigns the supplied scalar matrix to the matrix. */
+       self_type & operator = (scalar_matrix<SCALARTYPE> const & m)
+       {
+         assert( (m.size1() == size1_ || size1_ == 0) && bool("Size mismatch!") );
+         assert( (m.size2() == size2_ || size2_ == 0) && bool("Size mismatch!") );
+ 
+         if (internal_size() == 0)
+         {
+           size1_ = m.size1();
+           size2_ = m.size2();
+           internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
+           internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
+           if (internal_size() > 0)
+           {
+             viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), m.context());
+             clear();
+           }
+         }
+ 
+         if (internal_size() > 0)
+         {
+           viennacl::linalg::matrix_assign(*this, m(0,0));
+         }
+ 
+         return *this;
+       }
+ 
+ 
+       //read-write access to an element of the matrix/matrix_range/matrix_slice
+       /** @brief Read-write access to a single element of the matrix/matrix_range/matrix_slice
+       */
+       entry_proxy<SCALARTYPE> operator()(size_type row_index, size_type col_index)
+       {
+         return entry_proxy<SCALARTYPE>(F::mem_index(start1_ + stride1_ * row_index, start2_ + stride2_ * col_index, internal_size1(), internal_size2()), elements_);
+       }
+ 
+       /** @brief Read access to a single element of the matrix/matrix_range/matrix_slice
+       */
+       const_entry_proxy<SCALARTYPE> operator()(size_type row_index, size_type col_index) const
+       {
+         return const_entry_proxy<SCALARTYPE>(F::mem_index(start1_ + stride1_ * row_index, start2_ + stride2_ * col_index, internal_size1(), internal_size2()), elements_);
+       }
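+ 
+       // Element access sketch (assuming vcl_A is a viennacl::matrix<double>;
+       // each access implies a host<->device transfer, so use it sparingly):
+       //
+       //   vcl_A(1, 2) = 5.0;       // write through entry_proxy
+       //   double d = vcl_A(1, 2);  // read back a single entry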
+ 
+       //
+       // Operator overloads for enabling implicit conversions:
+       //
+       self_type & operator += (const self_type & other)
+       {
+         viennacl::linalg::ambm(*this,
+                                 *this, SCALARTYPE(1.0), 1, false, false,
+                                 other, SCALARTYPE(1.0), 1, false, false);
+         return *this;
+       }
+ 
+       self_type & operator -= (const self_type & other)
+       {
+         viennacl::linalg::ambm(*this,
+                                 *this, SCALARTYPE(1.0), 1, false, false,
+                                 other, SCALARTYPE(1.0), 1, false, true);
+         return *this;
+       }
+ 
+       /** @brief Scales a matrix by a CPU scalar value
+       */
+       self_type & operator *= (SCALARTYPE val)
+       {
+         //viennacl::linalg::inplace_mult(*this, val);
+         viennacl::linalg::am(*this,
+                               *this, val, 1, false, false);
+         return *this;
+       }
+ 
+       /** @brief Scales this matrix by a CPU scalar value
+       */
+       self_type & operator /= (SCALARTYPE val)
+       {
+         //viennacl::linalg::inplace_mult(*this, static_cast<SCALARTYPE>(1) / val);
+         viennacl::linalg::am(*this,
+                               *this, val, 1, true, false);
+         return *this;
+       }
+ 
+ 
+       /** @brief Sign flip for the matrix. Emulated to be equivalent to -1.0 * matrix */
+       matrix_expression<const self_type, const SCALARTYPE, op_mult> operator-() const
+       {
+         return matrix_expression<const self_type, const SCALARTYPE, op_mult>(*this, SCALARTYPE(-1));
+       }
+ 
+       /** @brief Returns the number of rows */
+       size_type size1() const { return size1_;}
+       /** @brief Returns the number of columns */
+       size_type size2() const { return size2_; }
+ 
+       /** @brief Returns the row offset of the first entry (nonzero for ranges and slices) */
+       size_type start1() const { return start1_;}
+       /** @brief Returns the column offset of the first entry (nonzero for ranges and slices) */
+       size_type start2() const { return start2_; }
+ 
+       /** @brief Returns the stride between consecutive rows (greater than one for slices) */
+       size_type stride1() const { return stride1_;}
+       /** @brief Returns the stride between consecutive columns (greater than one for slices) */
+       size_type stride2() const { return stride2_; }
+ 
+       /** @brief Resets all entries to zero */
+       void clear()
+       {
+         viennacl::linalg::matrix_assign(*this, SCALARTYPE(0), true);
+       }
+ 
+ 
+       /** @brief Returns the internal number of rows. Usually required for launching OpenCL kernels only */
+       size_type internal_size1() const { return internal_size1_; }
+       /** @brief Returns the internal number of columns. Usually required for launching OpenCL kernels only */
+       size_type internal_size2() const { return internal_size2_; }
+       /** @brief Returns the total amount of allocated memory in multiples of sizeof(SCALARTYPE) */
+       size_type internal_size() const { return internal_size1() * internal_size2(); }
+ 
+       /** @brief Returns the memory handle (OpenCL, CUDA, or host), non-const version */
+             handle_type & handle()       { return elements_; }
+       /** @brief Returns the memory handle (OpenCL, CUDA, or host), const version */
+       const handle_type & handle() const { return elements_; }
+ 
+ 
+       viennacl::memory_types memory_domain() const
+       {
+         return elements_.get_active_handle_id();
+       }
+ 
+     protected:
+ 
+       void set_handle(viennacl::backend::mem_handle const & h)
+       {
+         elements_ = h;
+       }
+ 
+       void switch_memory_context(viennacl::context new_ctx)
+       {
+         viennacl::backend::switch_memory_context<SCALARTYPE>(elements_, new_ctx);
+       }
+ 
+ 
+       /** @brief Resizes the matrix.
+       *   Existing entries can optionally be preserved.
+       *
+       * @param rows       New number of rows
+       * @param columns    New number of columns
+       * @param preserve   If true, existing values are preserved.
+       */
+       void resize(size_type rows, size_type columns, bool preserve = true)
+       {
+         assert( (rows > 0 && columns > 0) && bool("Check failed in matrix::resize(): Number of rows and columns must be positive!"));
+ 
+         if (preserve && internal_size() > 0)
+         {
+           //get old entries:
+           std::vector< SCALARTYPE > old_entries(internal_size());
+           viennacl::backend::memory_read(elements_, 0, sizeof(SCALARTYPE)*internal_size(), &(old_entries[0]));
+ 
+           //set up entries of new matrix:
+           std::vector< SCALARTYPE > new_entries(  viennacl::tools::align_to_multiple<vcl_size_t>(rows,    alignment)
+                                                 * viennacl::tools::align_to_multiple<vcl_size_t>(columns, alignment));
+           for (size_type i=0; i<rows; ++i)
+           {
+             if (i >= size1_)
+               continue;
+ 
+             for (size_type j=0; j<columns; ++j)
+             {
+               if (j >= size2_)
+                 continue;
+               new_entries[F::mem_index(i, j, viennacl::tools::align_to_multiple<vcl_size_t>(rows, alignment), viennacl::tools::align_to_multiple<vcl_size_t>(columns, alignment))]
+                   = old_entries[F::mem_index(i, j, internal_size1(), internal_size2())];
+             }
+           }
+ 
+           //copy new entries to GPU:
+           size1_ = rows;
+           size2_ = columns;
+           internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
+           internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
+           viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*new_entries.size(), viennacl::traits::context(elements_), &(new_entries[0]));
+         }
+         else //discard old entries:
+         {
+           size1_ = rows;
+           size2_ = columns;
+           internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
+           internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
+ 
+           viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(elements_));
+           clear();
+         }
+       }
+ 
+     private:
+       size_type size1_;
+       size_type size2_;
+       size_type start1_;
+       size_type start2_;
+       difference_type stride1_;
+       difference_type stride2_;
+       size_type internal_size1_;
+       size_type internal_size2_;
+       handle_type elements_;
+   }; //matrix
+ 
+ 
+ 
+   /** @brief A dense matrix class
+   *
+   * @tparam SCALARTYPE   The underlying scalar type (either float or double)
+   * @tparam F            Storage layout: either row_major or column_major
+   * @tparam ALIGNMENT    The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+   */
+   template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
+   class matrix : public matrix_base<SCALARTYPE, F>
+   {
+       typedef matrix<SCALARTYPE, F, ALIGNMENT>          self_type;
+       typedef matrix_base<SCALARTYPE, F>                base_type;
+     public:
+       typedef typename base_type::size_type             size_type;
+ 
+       /** @brief The default constructor. Does not allocate any memory. */
+       explicit matrix() : base_type() {}
+ 
+       /** @brief Creates the matrix with the given dimensions
+       *
+       * @param rows     Number of rows
+       * @param columns  Number of columns
+       * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+       */
+       explicit matrix(size_type rows, size_type columns, viennacl::context ctx = viennacl::context()) : base_type(rows, columns, ctx) {}
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+       explicit matrix(cl_mem mem, size_type rows, size_type columns) : base_type(mem, rows, columns) {}
+ #endif
+ 
+       template <typename LHS, typename RHS, typename OP>
+       matrix(matrix_expression< LHS, RHS, OP> const & proxy) : base_type(proxy) {}
+ 
+       /** @brief Creates the matrix from the supplied identity matrix. */
+       matrix(identity_matrix<SCALARTYPE> const & m) : base_type(m.size1(), m.size2(), m.context())
+       {
+         if (base_type::internal_size() > 0)
+           base_type::operator=(m);
+       }
+ 
+       /** @brief Creates the matrix from the supplied zero matrix. */
+       matrix(zero_matrix<SCALARTYPE> const & m) : base_type(m.size1(), m.size2(), m.context())
+       {
+         if (base_type::internal_size() > 0)
+           base_type::operator=(m);
+       }
+ 
+       /** @brief Creates the matrix from the supplied scalar matrix. */
+       matrix(scalar_matrix<SCALARTYPE> const & m) : base_type(m.size1(), m.size2(), m.context())
+       {
+         if (base_type::internal_size() > 0)
+           base_type::operator=(m);
+       }
+ 
+       matrix(const base_type & other) : base_type(other.size1(), other.size2(), viennacl::traits::context(other))
+       {
+         base_type::operator=(other);
+       }
+ 
+ 
+       //copy constructor:
+       matrix(const self_type & other) : base_type(other.size1(), other.size2(), viennacl::traits::context(other))
+       {
+         base_type::operator=(other);
+       }
+ 
+ 
+       /*template <typename M1>
+       self_type & operator=(const matrix_expression< const M1, const M1, op_trans> & proxy)
+       {
+         self_type temp(proxy.lhs());
+         *this = trans(temp);
+         return *this;
+       }*/
+ 
+       using base_type::operator=;
+ 
+       /** @brief Resizes the matrix.
+       *   Existing entries can optionally be preserved
+       *
+       * @param rows       New number of rows
+       * @param columns    New number of columns
+       * @param preserve   If true, existing values are preserved.
+       */
+       void resize(size_type rows, size_type columns, bool preserve = true)
+       {
+         base_type::resize(rows, columns, preserve);
+       }
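+ 
+       // Resize sketch: with preserve = true (the default), old entries survive
+       // in the upper-left block and new entries are zero:
+       //
+       //   viennacl::matrix<double> vcl_A(3, 3);
+       //   vcl_A.resize(5, 5);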
+ 
+   }; //matrix
+ 
+ 
+ 
+   /** @brief Prints the matrix. Output is compatible with boost::numeric::ublas
+   *
+   * @param s            STL output stream
+   * @param gpu_matrix   A dense ViennaCL matrix
+   */
+   template<class SCALARTYPE, typename F>
+   std::ostream & operator<<(std::ostream & s, const matrix_base<SCALARTYPE, F> & gpu_matrix)
+   {
+     typedef typename matrix_base<SCALARTYPE, F>::size_type      size_type;
+ 
+     std::vector<SCALARTYPE> tmp(gpu_matrix.internal_size());
+     viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE) * gpu_matrix.internal_size(), &(tmp[0]));
+ 
+     s << "[" << gpu_matrix.size1() << "," << gpu_matrix.size2() << "]";
+ 
+     s << "(";
+     for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+     {
+       s << "(";
+       for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+       {
+         s << tmp[F::mem_index(i * gpu_matrix.stride1() + gpu_matrix.start1(), j * gpu_matrix.stride2() + gpu_matrix.start2(), gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
+         if (j < gpu_matrix.size2() - 1)
+           s << ",";
+       }
+       s << ")";
+       if (i < gpu_matrix.size1() - 1)
+         s << ",";
+     }
+     s << ")";
+     return s;
+   }
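+ 
+   // Printing sketch (the matrix is transferred to the host first, so this is
+   // intended for debugging rather than performance-critical code):
+   //
+   //   std::cout << vcl_A << std::endl;   // e.g. [2,2]((1,0),(0,1))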
+ 
+   /** @brief Prints the matrix. Output is compatible with boost::numeric::ublas
+   *
+   * @param s            STL output stream
+   * @param expr         A matrix expression
+   */
+   template<typename LHS, typename RHS, typename OP>
+   std::ostream & operator<<(std::ostream & s, const matrix_expression<LHS, RHS, OP> & expr)
+   {
+     typedef typename viennacl::tools::CPU_SCALAR_TYPE_DEDUCER< typename tools::CONST_REMOVER<LHS>::ResultType >::ResultType     ScalarType;
+ 
+     matrix<ScalarType> temp = expr;
+     s << temp;
+     return s;
+   }
+ 
+   /** @brief Returns an expression template class representing a transposed matrix */
+   template<typename NumericT, typename F>
+   matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans>
+   trans(const matrix_base<NumericT, F> & mat)
+   {
+     return matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans>(mat, mat);
+   }
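+ 
+   // trans() sketch: the proxy is evaluated on assignment (A and B assumed to be
+   // distinct, initialized viennacl::matrix<double> objects):
+   //
+   //   B = viennacl::trans(A);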
+ 
+   //diag():
+   template<typename NumericT, typename F>
+   vector_expression< const matrix_base<NumericT, F>, const int, op_matrix_diag>
+   diag(const matrix_base<NumericT, F> & A, int k = 0)
+   {
+     return vector_expression< const matrix_base<NumericT, F>, const int, op_matrix_diag>(A, k);
+   }
+ 
+   template<typename NumericT>
+   matrix_expression< const vector_base<NumericT>, const int, op_vector_diag>
+   diag(const vector_base<NumericT> & v, int k = 0)
+   {
+     return matrix_expression< const vector_base<NumericT>, const int, op_vector_diag>(v, k);
+   }
+ 
+   // row():
+   template<typename NumericT, typename F>
+   vector_expression< const matrix_base<NumericT, F>, const unsigned int, op_row>
+   row(const matrix_base<NumericT, F> & A, unsigned int i)
+   {
+     return vector_expression< const matrix_base<NumericT, F>, const unsigned int, op_row>(A, i);
+   }
+ 
+   // column():
+   template<typename NumericT, typename F>
+   vector_expression< const matrix_base<NumericT, F>, const unsigned int, op_column>
+   column(const matrix_base<NumericT, F> & A, unsigned int j)
+   {
+     return vector_expression< const matrix_base<NumericT, F>, const unsigned int, op_column>(A, j);
+   }
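+ 
+   // Sketch of the extraction helpers above (A an initialized matrix, v a
+   // vector; all names hypothetical):
+   //
+   //   viennacl::vector<double> d = viennacl::diag(A);        // main diagonal of A
+   //   viennacl::vector<double> r = viennacl::row(A, 1u);     // second row of A
+   //   viennacl::vector<double> c = viennacl::column(A, 0u);  // first column of A
+   //   viennacl::matrix<double> D = viennacl::diag(v);        // diagonal matrix from v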
+ 
+   /////////////////////// transfer operations: //////////////////////////////////////
+ 
+   //
+   //cpu to gpu, generic type:
+   //
+   /** @brief Copies a dense matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
+   *
+   * @param cpu_matrix   A dense matrix on the host. Type requirements: .size1() returns number of rows, .size2() returns number of columns. Access to entries via operator()
+   * @param gpu_matrix   A dense ViennaCL matrix
+   */
+   template <typename CPU_MATRIX, typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+   void copy(const CPU_MATRIX & cpu_matrix,
+             matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix )
+   {
+     typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
+ 
+     //std::cout << "Copying CPU_MATRIX!" << std::endl;
+     //std::cout << "Size at begin: " << gpu_matrix.size1() << ", " << gpu_matrix.size2() << std::endl;
+     if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
+     {
+       gpu_matrix.resize(cpu_matrix.size1(),
+                         cpu_matrix.size2(), false);
+     }
+ 
+     assert( (gpu_matrix.size1() == cpu_matrix.size1()) && (gpu_matrix.size2() == cpu_matrix.size2()) && bool("Matrix dimensions mismatch.") );
+ 
+     std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
+     for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+     {
+       for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+         data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
+     }
+ 
+     viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(SCALARTYPE) * data.size(), viennacl::traits::context(gpu_matrix), &(data[0]));
+     //gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
+     //std::cout << "Size at end: " << gpu_matrix.size1() << ", " << gpu_matrix.size2() << std::endl;
+   }
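+ 
+   // Host-to-device sketch for the generic overload above (any host type with
+   // size1(), size2(), and operator()(i, j) works, e.g. a Boost.uBLAS matrix):
+   //
+   //   boost::numeric::ublas::matrix<float> ublas_A(64, 64);
+   //   viennacl::matrix<float> vcl_A;
+   //   viennacl::copy(ublas_A, vcl_A);   // resizes vcl_A and transfers the data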
+ 
+   //
+   //cpu to gpu, STL type:
+   //
+   /** @brief Copies a dense STL-type matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
+   *
+   * @param cpu_matrix   A dense matrix on the host of type std::vector< std::vector<> >. cpu_matrix[i][j] returns the element in the i-th row and j-th column (both starting with zero)
+   * @param gpu_matrix   A dense ViennaCL matrix
+   */
+   template <typename SCALARTYPE, typename A1, typename A2, typename F, unsigned int ALIGNMENT>
+   void copy(const std::vector< std::vector<SCALARTYPE, A1>, A2> & cpu_matrix,
+             matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix )
+   {
+     typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
+ 
+     if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
+     {
+       gpu_matrix.resize(cpu_matrix.size(),
+                         cpu_matrix[0].size(),
+                         false);
+     }
+ 
+     assert( (gpu_matrix.size1() == cpu_matrix.size()) && bool("Matrix dimensions mismatch.") );
+ 
+     std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
+     for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+     {
+       assert( (gpu_matrix.size2() == cpu_matrix[i].size()) && bool("Matrix dimensions mismatch.") );
+ 
+       for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+         data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix[i][j];
+     }
+ 
+     viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(SCALARTYPE) * data.size(), viennacl::traits::context(gpu_matrix), &(data[0]));
+     //gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
+   }
+ 
+ 
+   //
+   //cpu to gpu, another STL type:
+   //
+   /** @brief Copies a dense matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU) without creating a temporary. The matrix layout on the CPU must match the matrix layout on the GPU.
+   *
+   * @param cpu_matrix_begin   Pointer to the first matrix entry. Cf. iterator concept in STL
+   * @param cpu_matrix_end     Pointer past the last matrix entry. Cf. iterator concept in STL
+   * @param gpu_matrix         A dense ViennaCL matrix
+   */
+   template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+   void fast_copy(SCALARTYPE * cpu_matrix_begin,
+                  SCALARTYPE * cpu_matrix_end,
+                  matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
+   {
+     viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(SCALARTYPE) * (cpu_matrix_end - cpu_matrix_begin), viennacl::traits::context(gpu_matrix), cpu_matrix_begin);
+     /*gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
+                                                                           sizeof(SCALARTYPE) * (cpu_matrix_end - cpu_matrix_begin),
+                                                                           cpu_matrix_begin);*/
+   }
+ 
+ 
+   #ifdef VIENNACL_WITH_EIGEN
+   /** @brief Copies a dense Eigen matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
+   *
+   * @param cpu_matrix   A dense Eigen matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th column (both starting with zero)
+   * @param gpu_matrix   A dense ViennaCL matrix
+   */
+   template <typename F, unsigned int ALIGNMENT>
+   void copy(const Eigen::MatrixXf & cpu_matrix,
+             matrix<float, F, ALIGNMENT> & gpu_matrix)
+   {
+     typedef typename matrix<float, F, ALIGNMENT>::size_type      size_type;
+ 
+     if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
+     {
+       gpu_matrix.resize(cpu_matrix.rows(),
+                         cpu_matrix.cols(),
+                         false);
+     }
+     else
+     {
+       assert( (gpu_matrix.size1() == static_cast<vcl_size_t>(cpu_matrix.rows()))
+               && (gpu_matrix.size2() == static_cast<vcl_size_t>(cpu_matrix.cols()))
+               && bool("matrix size mismatch")
+             );
+     }
+ 
+     std::vector<float> data(gpu_matrix.internal_size());
+     for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+     {
+       for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+         data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
+     }
+ 
+     viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(float) * data.size(), viennacl::traits::context(gpu_matrix), &(data[0]));
+     //gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
+   }
+ 
+   /** @brief Copies a dense Eigen matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
+   *
+   * @param cpu_matrix   A dense Eigen matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th column (both starting with zero)
+   * @param gpu_matrix   A dense ViennaCL matrix
+   */
+   template <typename F, unsigned int ALIGNMENT>
+   void copy(const Eigen::MatrixXd & cpu_matrix,
+             matrix<double, F, ALIGNMENT> & gpu_matrix)
+   {
+     typedef typename matrix<double, F, ALIGNMENT>::size_type      size_type;
+ 
+     if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
+     {
+       gpu_matrix.resize(cpu_matrix.rows(),
+                         cpu_matrix.cols(),
+                         false);
+     }
+     else
+     {
+       assert( (gpu_matrix.size1() == static_cast<vcl_size_t>(cpu_matrix.rows()))
+               && (gpu_matrix.size2() == static_cast<vcl_size_t>(cpu_matrix.cols()))
+               && bool("matrix size mismatch")
+             );
+     }
+ 
+     std::vector<double> data(gpu_matrix.internal_size());
+     for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+     {
+       for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+         data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
+     }
+ 
+     viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(double) * data.size(), viennacl::traits::context(gpu_matrix), &(data[0]));
+   }
+   #endif
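+ 
+   #if defined(VIENNACL_MATRIX_USAGE_SKETCHES) && defined(VIENNACL_WITH_EIGEN)  // hypothetical guard
+   // Usage sketch (illustrative): copying an Eigen matrix to the device via the
+   // overloads above; entries are staged through a host buffer in device layout.
+   inline void sketch_copy_from_eigen()
+   {
+     Eigen::MatrixXd E = Eigen::MatrixXd::Random(100, 100);
+     viennacl::matrix<double> M(100, 100);
+     viennacl::copy(E, M);
+   }
+   #endif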
+ 
+   #ifdef VIENNACL_WITH_MTL4
+   /** @brief Copies a dense MTL matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
+   *
+   * @param cpu_matrix   A dense MTL matrix. cpu_matrix[i][j] returns the element in the i-th row and j-th column (both starting with zero)
+   * @param gpu_matrix   A dense ViennaCL matrix
+   */
+   template <typename SCALARTYPE, typename T, typename F, unsigned int ALIGNMENT>
+   void copy(const mtl::dense2D<SCALARTYPE, T>& cpu_matrix,
+             matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
+   {
+     typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
+ 
+     if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
+     {
+       gpu_matrix.resize(cpu_matrix.num_rows(),
+                         cpu_matrix.num_cols(),
+                         false);
+     }
+     else
+     {
+       assert( (gpu_matrix.size1() == cpu_matrix.num_rows())
+               && (gpu_matrix.size2() == cpu_matrix.num_cols())
+               && bool("matrix size mismatch")
+             );
+     }
+ 
+     std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
+     for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+     {
+       for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+         data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix[i][j];
+     }
+ 
+     viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(SCALARTYPE) * data.size(), viennacl::traits::context(gpu_matrix), &(data[0]));
+   }
+   #endif
+ 
+ 
+ 
+ 
+   //
+   //gpu to cpu, generic type
+   //
+   /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU).
+   *
+   * @param gpu_matrix   A dense ViennaCL matrix
+   * @param cpu_matrix   A dense matrix on the host. Must have at least as many rows and columns as gpu_matrix. Type requirement: access to entries via operator()(i, j)
+   */
+   template <typename CPU_MATRIX, typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+   void copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
+             CPU_MATRIX & cpu_matrix )
+   {
+     typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
+ 
+     if ( (gpu_matrix.size1() > 0) && (gpu_matrix.size2() > 0) )
+     {
+       assert( viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1() && bool("Matrix dimensions mismatch: rows"));
+ 
+       std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
+       viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]));
+ 
+       //now copy entries to cpu_matrix:
+       assert( viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2() && bool("Matrix dimensions mismatch: columns"));
+       for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+       {
+         for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+           cpu_matrix(i,j) = temp_buffer[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
+       }
+     }
+   }
+ 
+   //gpu to cpu, STL type
+   /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU).
+   *
+   * @param gpu_matrix   A dense ViennaCL matrix
+   * @param cpu_matrix   A dense matrix on the host using STL types, typically std::vector< std::vector<> >. Must have at least as many rows and columns as gpu_matrix. Type requirement: access to entries via operator[]
+   */
+   template <typename SCALARTYPE, typename A1, typename A2, typename F, unsigned int ALIGNMENT>
+   void copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
+             std::vector< std::vector<SCALARTYPE, A1>, A2> & cpu_matrix)
+   {
+     typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
+ 
+     if ( (gpu_matrix.size1() > 0) && (gpu_matrix.size2() > 0) )
+     {
+       assert( (cpu_matrix.size() == gpu_matrix.size1()) && bool("Matrix dimensions mismatch: rows"));
+ 
+       std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
+       viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]));
+ 
+       //now copy entries to cpu_matrix:
+       for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+       {
+         assert( (cpu_matrix[i].size() == gpu_matrix.size2()) && bool("Matrix dimensions mismatch: columns"));
+ 
+         for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+           cpu_matrix[i][j] = temp_buffer[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
+       }
+     }
+   }
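+ 
+   #ifdef VIENNACL_MATRIX_USAGE_SKETCHES  // hypothetical guard
+   // Usage sketch (illustrative): reading a device matrix back into nested
+   // std::vectors via the overload above; each inner vector must already hold
+   // size2() entries.
+   inline void sketch_copy_to_host()
+   {
+     viennacl::matrix<float> M(4, 4);
+     std::vector< std::vector<float> > host(4, std::vector<float>(4));
+     viennacl::copy(M, host);   // one buffer read, then element-wise scatter
+   }
+   #endif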
+ 
+   //gpu to cpu, fast copy to raw memory
+   /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU).
+   *
+   * @param gpu_matrix         A dense ViennaCL matrix
+   * @param cpu_matrix_begin   Pointer to the output memory on the CPU. The user must ensure that the provided memory holds at least internal_size() entries.
+   */
+   template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+   void fast_copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
+                   SCALARTYPE * cpu_matrix_begin)
+   {
+     viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), cpu_matrix_begin);
+   }
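+ 
+   #ifdef VIENNACL_MATRIX_USAGE_SKETCHES  // hypothetical guard
+   // Usage sketch (illustrative): fast_copy() writes internal_size() entries in
+   // device layout, so the destination buffer must be padded accordingly.
+   inline void sketch_fast_copy_to_host()
+   {
+     viennacl::matrix<float> M(32, 32);
+     std::vector<float> host(M.internal_size());
+     viennacl::fast_copy(M, &host[0]);   // raw read, no per-element conversion
+   }
+   #endif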
+ 
+ 
+ 
+   /////////////////////// matrix operator overloads to follow ////////////////////////////////////////////
+ 
+ 
+   // operator +
+   /** @brief Generic 'catch-all' overload, which enforces a temporary if the expression tree gets too deep. */
+   template <typename LHS1, typename RHS1, typename OP1,
+             typename LHS2, typename RHS2, typename OP2>
+   matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                      const matrix_expression<const LHS2, const RHS2, OP2>,
+                      op_add>
+   operator + (matrix_expression<const LHS1, const RHS1, OP1> const & proxy1,
+               matrix_expression<const LHS2, const RHS2, OP2> const & proxy2)
+   {
+     assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+             && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+             && bool("Incompatible matrix sizes!"));
+     return matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                               const matrix_expression<const LHS2, const RHS2, OP2>,
+                               op_add>(proxy1, proxy2);
+   }
+ 
+   template <typename LHS1, typename RHS1, typename OP1,
+             typename NumericT, typename F>
+   matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                      const matrix_base<NumericT, F>,
+                      op_add>
+   operator + (matrix_expression<const LHS1, const RHS1, OP1> const & proxy1,
+               matrix_base<NumericT, F> const & proxy2)
+   {
+     assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+             && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+             && bool("Incompatible matrix sizes!"));
+     return matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                               const matrix_base<NumericT, F>,
+                               op_add>(proxy1, proxy2);
+   }
+ 
+   template <typename NumericT, typename F,
+             typename LHS2, typename RHS2, typename OP2>
+   matrix_expression< const matrix_base<NumericT, F>,
+                      const matrix_expression<const LHS2, const RHS2, OP2>,
+                      op_add>
+   operator + (matrix_base<NumericT, F> const & proxy1,
+               matrix_expression<const LHS2, const RHS2, OP2> const & proxy2)
+   {
+     assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+             && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+             && bool("Incompatible matrix sizes!"));
+     return  matrix_expression< const matrix_base<NumericT, F>,
+                                const matrix_expression<const LHS2, const RHS2, OP2>,
+                                op_add>(proxy1, proxy2);
+   }
+ 
+   /** @brief Operator overload for m1 + m2, where m1 and m2 are either dense matrices, matrix ranges, or matrix slices. No mixing of different storage layouts allowed at the moment. */
+   template <typename NumericT, typename F>
+   matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_add >
+   operator + (const matrix_base<NumericT, F> & m1, const matrix_base<NumericT, F> & m2)
+   {
+     return matrix_expression< const matrix_base<NumericT, F>,
+                               const matrix_base<NumericT, F>,
+                               op_add > (m1, m2);
+   }
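+ 
+   #ifdef VIENNACL_MATRIX_USAGE_SKETCHES  // hypothetical guard
+   // Sketch (illustrative): the overloads above only build lightweight
+   // expression templates; no kernel runs until the result is assigned.
+   inline void sketch_operator_add()
+   {
+     viennacl::matrix<float> A(16, 16), B(16, 16), C(16, 16);
+     C = A + B;         // matrix_base + matrix_base
+     C = (A + B) + A;   // matrix_expression + matrix_base
+   }
+   #endif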
+ 
+ 
+   // operator -
+   template <typename LHS1, typename RHS1, typename OP1,
+             typename LHS2, typename RHS2, typename OP2>
+   matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                      const matrix_expression<const LHS2, const RHS2, OP2>,
+                      op_sub>
+   operator - (matrix_expression<const LHS1, const RHS1, OP1> const & proxy1,
+               matrix_expression<const LHS2, const RHS2, OP2> const & proxy2)
+   {
+     assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+             && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+             && bool("Incompatible matrix sizes!"));
+     return matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                               const matrix_expression<const LHS2, const RHS2, OP2>,
+                               op_sub>(proxy1, proxy2);
+   }
+ 
+   template <typename LHS1, typename RHS1, typename OP1,
+             typename NumericT, typename F>
+   matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                      const matrix_base<NumericT, F>,
+                      op_sub>
+   operator - (matrix_expression<const LHS1, const RHS1, OP1> const & proxy1,
+               matrix_base<NumericT, F> const & proxy2)
+   {
+     assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+             && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+             && bool("Incompatible matrix sizes!"));
+     return matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                               const matrix_base<NumericT, F>,
+                               op_sub>(proxy1, proxy2);
+   }
+ 
+   template <typename NumericT, typename F,
+             typename LHS2, typename RHS2, typename OP2>
+   matrix_expression< const matrix_base<NumericT, F>,
+                      const matrix_expression<const LHS2, const RHS2, OP2>,
+                      op_sub>
+   operator - (matrix_base<NumericT, F> const & proxy1,
+               matrix_expression<const LHS2, const RHS2, OP2> const & proxy2)
+   {
+     assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+             && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+             && bool("Incompatible matrix sizes!"));
+     return  matrix_expression< const matrix_base<NumericT, F>,
+                                const matrix_expression<const LHS2, const RHS2, OP2>,
+                                op_sub>(proxy1, proxy2);
+   }
+ 
+   /** @brief Operator overload for m1 - m2, where m1 and m2 are either dense matrices, matrix ranges, or matrix slices. No mixing of different storage layouts allowed at the moment. */
+   template <typename NumericT, typename F>
+   matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_sub >
+   operator - (const matrix_base<NumericT, F> & m1, const matrix_base<NumericT, F> & m2)
+   {
+     return matrix_expression< const matrix_base<NumericT, F>,
+                               const matrix_base<NumericT, F>,
+                               op_sub > (m1, m2);
+   }
+ 
+ 
+ 
+   // operator *
+   /** @brief Operator overload for the expression alpha * m1, where alpha is a CPU or GPU scalar and m1 is a ViennaCL matrix.
+   *
+   * @param value   The scalar (CPU or GPU)
+   * @param m1      A ViennaCL matrix
+   */
+   template <typename S1, typename NumericT, typename F>
+   typename viennacl::enable_if<    viennacl::is_any_scalar<S1>::value,
+                                 matrix_expression< const matrix_base<NumericT, F>, const S1, op_mult>
+                               >::type
+   operator * (S1 const & value, matrix_base<NumericT, F> const & m1)
+   {
+     return matrix_expression< const matrix_base<NumericT, F>, const S1, op_mult>(m1, value);
+   }
+ 
+ 
+   /** @brief Operator overload for the multiplication of a matrix expression with a scalar from the right, e.g. (beta * m1) * alpha. Here, beta * m1 is wrapped into a matrix_expression and then multiplied with alpha from the right.
+   *
+   * @param proxy   Left hand side matrix expression
+   * @param val     Right hand side scalar
+   */
+   template <typename LHS, typename RHS, typename OP, typename S1>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 matrix_expression< const matrix_expression< LHS, RHS, OP>, const S1, op_mult> >::type
+   operator * (matrix_expression< LHS, RHS, OP> const & proxy,
+               S1 const & val)
+   {
+     return matrix_expression< const matrix_expression< LHS, RHS, OP>, const S1, op_mult>(proxy, val);
+   }
+ 
+ 
+   /** @brief Operator overload for the multiplication of a matrix expression with a scalar from the left, e.g. alpha * (beta * m1). Here, beta * m1 is wrapped into a matrix_expression and then multiplied by alpha from the left.
+   *
+   * @param val     Left hand side scalar
+   * @param proxy   Right hand side matrix expression
+   */
+   template <typename S1, typename LHS, typename RHS, typename OP>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 matrix_expression< const matrix_expression< LHS, RHS, OP>, const S1, op_mult> >::type
+   operator * (S1 const & val,
+               matrix_expression< LHS, RHS, OP> const & proxy)
+   {
+     return matrix_expression< const matrix_expression< LHS, RHS, OP>, const S1, op_mult>(proxy, val);
+   }
+ 
+   /** @brief Scales the matrix by a CPU or GPU scalar 'alpha' and returns an expression template
+   */
+   template <typename NumericT, typename F, typename S1>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 matrix_expression< const matrix_base<NumericT, F>, const S1, op_mult> >::type
+   operator * (matrix_base<NumericT, F> const & m1, S1 const & s1)
+   {
+     return matrix_expression< const matrix_base<NumericT, F>, const S1, op_mult>(m1, s1);
+   }
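+ 
+   #ifdef VIENNACL_MATRIX_USAGE_SKETCHES  // hypothetical guard
+   // Sketch (illustrative): scaling from either side yields a matrix_expression
+   // with op_mult; a plain float qualifies via is_any_scalar.
+   inline void sketch_scalar_mult()
+   {
+     viennacl::matrix<float> A(16, 16), B(16, 16);
+     B = 2.0f * A;            // scalar * matrix
+     B = A * 2.0f;            // matrix * scalar
+     B = (2.0f * A) * 3.0f;   // matrix_expression * scalar
+   }
+   #endif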
+ 
+ 
+   // operator *=
+ 
+   /** @brief Scales a matrix in-place by a GPU scalar value
+   */
+   template <typename NumericT, typename F, typename S1>
+   typename viennacl::enable_if< viennacl::is_scalar<S1>::value,
+                                 matrix_base<NumericT, F> &
+                               >::type
+   operator *= (matrix_base<NumericT, F> & m1, S1 const & gpu_val)
+   {
+     viennacl::linalg::am(m1,
+                          m1, gpu_val, 1, false, viennacl::is_flip_sign_scalar<S1>::value);
+     return m1;
+   }
+ 
+ 
+   // operator /
+ 
+ 
+   /** @brief Operator overload for the division of a matrix expression by a scalar from the right, e.g. (beta * m1) / alpha. Here, beta * m1 is wrapped into a matrix_expression and then divided by alpha.
+   *
+   * @param proxy   Left hand side matrix expression
+   * @param val     Right hand side scalar
+   */
+   template <typename LHS, typename RHS, typename OP, typename S1>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 matrix_expression< const matrix_expression<const LHS, const RHS, OP>, const S1, op_div> >::type
+   operator / (matrix_expression<const LHS, const RHS, OP> const & proxy,
+               S1 const & val)
+   {
+     return matrix_expression< const matrix_expression<const LHS, const RHS, OP>, const S1, op_div>(proxy, val);
+   }
+ 
+ 
+   /** @brief Returns an expression template for dividing the matrix by a CPU or GPU scalar 'alpha'
+   */
+   template <typename NumericT, typename F, typename S1>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 matrix_expression< const matrix_base<NumericT, F>, const S1, op_div> >::type
+   operator / (matrix_base<NumericT, F> const & m1, S1 const & s1)
+   {
+     return matrix_expression< const matrix_base<NumericT, F>, const S1, op_div>(m1, s1);
+   }
+ 
+ 
+   // operator /=
+ 
+   /** @brief Divides a matrix in-place by a GPU scalar value
+   */
+   template <typename NumericT, typename F, typename S1>
+   typename viennacl::enable_if< viennacl::is_scalar<S1>::value,
+                                 matrix_base<NumericT, F> &
+                               >::type
+   operator /= (matrix_base<NumericT, F> & m1, S1 const & gpu_val)
+   {
+     viennacl::linalg::am(m1,
+                          m1, gpu_val, 1, true, viennacl::is_flip_sign_scalar<S1>::value);
+     return m1;
+   }
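+ 
+   #ifdef VIENNACL_MATRIX_USAGE_SKETCHES  // hypothetical guard
+   // Sketch (illustrative): the compound operators defined here take a device
+   // scalar (is_scalar); both lower to a single viennacl::linalg::am() call.
+   inline void sketch_inplace_scaling()
+   {
+     viennacl::matrix<float> A(16, 16);
+     viennacl::scalar<float> s = 2.0f;
+     A *= s;   // am(A, A, s, 1, /*reciprocal*/ false, ...)
+     A /= s;   // am(A, A, s, 1, /*reciprocal*/ true,  ...)
+   }
+   #endif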
+ 
+ 
+ 
+ 
+ 
+   // outer_prod(v1, v2) * val;
+   template <typename NumericT, typename S1>
+   typename viennacl::enable_if< viennacl::is_scalar<S1>::value,
+                                 viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                                              const S1,
+                                                              op_mult>
+                               >::type
+   operator*(const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy,
+             const S1 & val)
+   {
+     return viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                         const S1,
+                                         op_mult>(proxy, val);
+   }
+ 
+   template <typename NumericT, typename S1>
+   typename viennacl::enable_if< viennacl::is_cpu_scalar<S1>::value,
+                                 viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                                               const NumericT,
+                                                               op_mult>
+                               >::type
+   operator*(const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy,
+             const S1 & val)
+   {
+     return viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                         const NumericT,
+                                         op_mult>(proxy, NumericT(val));
+   }
+ 
+   // val * outer_prod(v1, v2);
+   template <typename NumericT, typename S1>
+   typename viennacl::enable_if< viennacl::is_scalar<S1>::value,
+                                 viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                                              const S1,
+                                                              op_mult>
+                               >::type
+   operator*(const S1 & val,
+             const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy)
+   {
+     return viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                         const S1,
+                                         op_mult>(proxy, val);
+   }
+ 
+   template<typename NumericT, typename S1>
+   typename viennacl::enable_if< viennacl::is_cpu_scalar<S1>::value,
+                                 viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                                              const NumericT,
+                                                              op_mult>
+                               >::type
+   operator*(const S1 & val,
+             const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy)
+   {
+     return viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                         const NumericT,
+                                         op_mult>(proxy, NumericT(val));
+   }
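+ 
+   #ifdef VIENNACL_MATRIX_USAGE_SKETCHES  // hypothetical guard
+   // Sketch (illustrative): scaling an outer product; outer_prod() returns the
+   // matrix_expression<vector, vector, op_prod> matched by the overloads above.
+   inline void sketch_scaled_outer_prod()
+   {
+     viennacl::vector<float> v1(16), v2(16);
+     viennacl::matrix<float> A(16, 16);
+     A = 2.0f * viennacl::linalg::outer_prod(v1, v2);
+     A = viennacl::linalg::outer_prod(v1, v2) * 2.0f;
+   }
+   #endif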
+ 
+ 
+ 
+   //
+   // Specify available operations:
+   //
+ 
+   /** \cond */
+ 
+   namespace linalg
+   {
+     namespace detail
+     {
+ 
+       // x = y
+       template <typename T, typename F>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_base<T, F> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_base<T, F> const & rhs)
+         {
+           viennacl::linalg::am(lhs, rhs, T(1), 1, false, false);
+         }
+       };
+ 
+       // x += y
+       template <typename T, typename F>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_base<T, F> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_base<T, F> const & rhs)
+         {
+           viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, rhs, T(1), 1, false, false);
+         }
+       };
+ 
+       // x -= y
+       template <typename T, typename F>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_base<T, F> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_base<T, F> const & rhs)
+         {
+           viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, rhs, T(1), 1, false, true);
+         }
+       };
+ 
+       ///////////// x  OP  y * alpha ////////////////////////
+ 
+ 
+       // x = alpha * y
+       template <typename T, typename F, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> const & proxy)
+         {
+           viennacl::linalg::am(lhs, proxy.lhs(), proxy.rhs(), 1, false, false);
+         }
+       };
+ 
+       // x += alpha * y
+       template <typename T, typename F, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, false);
+         }
+       };
+ 
+       // x -= alpha * y
+       template <typename T, typename F, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, true);
+         }
+       };
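+ 
+       // Reading aid for the am()/ambm() calls above: each matrix operand is
+       // followed by (coefficient, length, use_reciprocal, flip_sign), so e.g.
+       // "x -= alpha * y" becomes
+       //   ambm(x,  x, 1, 1, false, false,   y, alpha, 1, false, true);
+       // i.e. x = 1*x + (-alpha)*y.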
+ 
+ 
+       ///////////// x  OP  mat_expr * alpha ////////////////////////
+ 
+       // x = alpha * mat_expr
+       template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
+       {
+           static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
+           {
+             matrix<T, F> temp(proxy.lhs());
+             lhs = temp * proxy.rhs();
+           }
+       };
+ 
+       // x += alpha * mat_expr
+       template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
+       {
+           static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
+           {
+             matrix<T, F> temp(proxy.lhs());
+             lhs += temp * proxy.rhs();
+           }
+       };
+ 
+       // x -= alpha * mat_expr
+       template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
+       {
+           static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
+           {
+             matrix<T, F> temp(proxy.lhs());
+             lhs -= temp * proxy.rhs();
+           }
+       };
+ 
+ 
+       ///////////// x  OP  y / alpha ////////////////////////
+ 
+       // x = y / alpha
+       template <typename T, typename F, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> const & proxy)
+         {
+           viennacl::linalg::am(lhs, proxy.lhs(), proxy.rhs(), 1, true, false);
+         }
+       };
+ 
+       // x += y / alpha
+       template <typename T, typename F, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, true, false);
+         }
+       };
+ 
+       // x -= y / alpha
+       template <typename T, typename F, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, true, true);
+         }
+       };
+ 
+ 
+       ///////////// x  OP  mat_expr / alpha ////////////////////////
+ 
+       // x = mat_expr / alpha
+       template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+       {
+           static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+           {
+             matrix<T, F> temp(proxy.lhs());
+             lhs = temp / proxy.rhs();
+           }
+       };
+ 
+       // x += mat_expr / alpha
+       template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+       {
+           static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+           {
+             matrix<T, F> temp(proxy.lhs());
+             lhs += temp / proxy.rhs();
+           }
+       };
+ 
+       // x -= mat_expr / alpha
+       template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+       {
+           static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+           {
+             matrix<T, F> temp(proxy.lhs());
+             lhs -= temp / proxy.rhs();
+           }
+       };
+ 
+ 
+ 
+       // generic x = mat_expr1 + mat_expr2:
+       template <typename T, typename F, typename LHS, typename RHS>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const LHS, const RHS, op_add> >
+       {
+         // generic x = mat_expr1 + mat_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_add> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             matrix_base<T, F> temp(proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(temp, proxy.rhs());
+             lhs = temp;
+           }
+           else
+           {
+             op_executor<matrix_base<T, F>, op_assign, LHS>::apply(lhs, proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x = y + z
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_add> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x = alpha * y + z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x = y / alpha + z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x = y + beta * z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x = y + z / beta
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x = alpha * y + beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x = alpha * y + z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x = y / alpha + beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x = y / alpha + z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+       };
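+ 
+       // Dispatch sketch (illustrative): an assignment such as
+       //   x = alpha * y + z / beta;
+       // matches the (op_mult, op_div) overload above and lowers to one fused
+       //   ambm(x,  y, alpha, 1, false, false,   z, beta, 1, true, false);
+       // call instead of multiple kernels.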
+ 
+       // dense = sparse * dense
+       template <typename T, typename F1, typename LHS, typename RHS>
+       struct op_executor<matrix_base<T, F1>, op_assign, matrix_expression<const LHS, const RHS, op_prod> >
+       {
+         template < typename SparseMatrixType, typename F2 >
+         static void apply(matrix_base<T, F1> & lhs, matrix_expression<const SparseMatrixType,
+                                                                      const viennacl::matrix_base<T, F2>,
+                                                                      viennacl::op_prod> const & proxy)
+         {
+           viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), lhs);
+         }
+ 
+         // dense = sparse * trans(dense)
+         template < typename SparseMatrixType, typename F2 >
+         static void apply(matrix_base<T, F1> & lhs, matrix_expression<const SparseMatrixType,
+                                                                      const viennacl::matrix_expression< const viennacl::matrix_base<T, F2>,
+                                                                                                         const viennacl::matrix_base<T, F2>,
+                                                                                                         viennacl::op_trans >,
+                                                                      viennacl::op_prod> const & proxy)
+         {
+           viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), lhs);
+         }
+ 
+       };
+ 
+       // generic x += mat_expr1 + mat_expr2:
+       template <typename T, typename F, typename LHS, typename RHS>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const LHS, const RHS, op_add> >
+       {
+         // generic x += mat_expr1 + mat_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_add> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             matrix_base<T, F> temp(proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(temp, proxy.rhs());
+             lhs += temp;
+           }
+           else
+           {
+             op_executor<matrix_base<T, F>, op_inplace_add, LHS>::apply(lhs, proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x += y + z
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x += alpha * y + z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x += y / alpha + z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x += y + beta * z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x += y + z / beta
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x += alpha * y + beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x += alpha * y + z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x += y / alpha + beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x += y / alpha + z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+       };
+ 
+ 
+ 
+       // generic x -= mat_expr1 + mat_expr2:
+       template <typename T, typename F, typename LHS, typename RHS>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const LHS, const RHS, op_add> >
+       {
+         // generic x -= mat_expr1 + mat_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_add> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             matrix_base<T, F> temp(proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(temp, proxy.rhs());
+             lhs -= temp;
+           }
+           else
+           {
+             op_executor<matrix_base<T, F>, op_inplace_sub, LHS>::apply(lhs, proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x -= y + z
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x -= alpha * y + z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x -= y / alpha + z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x -= y + beta * z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x -= y + z / beta
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x -= alpha * y + beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x -= alpha * y + z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x -= y / alpha + beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x -= y / alpha + z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+       };
+ 
+ 
+ 
+       ///////////////////////
+ 
+ 
+ 
+       // generic x = mat_expr1 - mat_expr2:
+       template <typename T, typename F, typename LHS, typename RHS>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const LHS, const RHS, op_sub> >
+       {
+         // generic x = mat_expr1 - mat_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_sub> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             matrix_base<T, F> temp(proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());
+             lhs = temp;
+           }
+           else
+           {
+             op_executor<matrix_base<T, F>, op_assign, LHS>::apply(lhs, proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x = y - z
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x = alpha * y - z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x = y / alpha - z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x = y - beta * z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x = y - z / beta
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x = alpha * y - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x = alpha * y - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x = y / alpha - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x = y / alpha - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+       };
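+ 
+       // Usage sketch (illustrative, not part of the library source): an
+       // expression such as
+       //   A = 2.0 * B - C / 3.0;   // A, B, C: viennacl::matrix<double>
+       // is dispatched through the executor above and folded into a single
+       // ambm() call; the (reciprocal, flip_sign) flag pairs seen in each
+       // overload encode "divide by the scalar" and "subtract this operand",
+       // so no temporary matrix is needed on the non-aliasing path.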
+ 
+ 
+       // generic x += mat_expr1 - mat_expr2:
+       template <typename T, typename F, typename LHS, typename RHS>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const LHS, const RHS, op_sub> >
+       {
+         // generic x += mat_expr1 - mat_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_sub> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             matrix_base<T, F> temp(proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());
+             lhs += temp;
+           }
+           else
+           {
+             op_executor<matrix_base<T, F>, op_inplace_add, LHS>::apply(lhs, proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x += y - z
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x += alpha * y - z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x += y / alpha - z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x += y - beta * z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x += y - z / beta
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x += alpha * y - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x += alpha * y - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x += y / alpha - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x += y / alpha - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+       };
+ 
+ 
+ 
+       // generic x -= mat_expr1 - mat_expr2:
+       template <typename T, typename F, typename LHS, typename RHS>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const LHS, const RHS, op_sub> >
+       {
+         // generic x -= mat_expr1 - mat_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_sub> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             matrix_base<T, F> temp(proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());
+             lhs -= temp;
+           }
+           else
+           {
+             op_executor<matrix_base<T, F>, op_inplace_sub, LHS>::apply(lhs, proxy.lhs());
+             op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x -= y - z
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x -= alpha * y - z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x -= y / alpha - z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   const matrix_base<T, F>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x -= y - beta * z
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x -= y - z / beta
+         template <typename ScalarType>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x -= alpha * y - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x -= alpha * y - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x -= y / alpha - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x -= y / alpha - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                   const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::ambm_m(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+       };
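+ 
+       // Usage sketch (illustrative): the in-place variants let expressions
+       // like
+       //   A += B - 2.0 * C;
+       //   A -= B / 4.0 - C;
+       // run as a single ambm_m() accumulation. Note that op_inplace_sub
+       // simply inverts the flip_sign flags of both operands relative to the
+       // op_inplace_add case, instead of materializing (B - C) first.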
+ 
+ 
+       //////////////////// diag(), row(), column() operations ////////////////////////////////////////
+ 
+       template <typename T, typename F, typename LHS>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const LHS, const int, op_vector_diag> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const vector_base<T>, const int, op_vector_diag> const & proxy)
+         {
+           viennacl::linalg::matrix_diag_from_vector(proxy.lhs(), proxy.rhs(), lhs);
+         }
+       };
+ 
+ 
+       template <typename T, typename LHS>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const int, op_matrix_diag> >
+       {
+         template <typename F>
+         static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const int, op_matrix_diag> const & proxy)
+         {
+           viennacl::linalg::matrix_diag_to_vector(proxy.lhs(), proxy.rhs(), lhs);
+         }
+       };
+ 
+       template <typename T, typename LHS>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const unsigned int, op_row> >
+       {
+         template <typename F>
+         static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const unsigned int, op_row> const & proxy)
+         {
+           viennacl::linalg::matrix_row(proxy.lhs(), proxy.rhs(), lhs);
+         }
+       };
+ 
+ 
+       template <typename T, typename LHS>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const unsigned int, op_column> >
+       {
+         template <typename F>
+         static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const unsigned int, op_column> const & proxy)
+         {
+           viennacl::linalg::matrix_column(proxy.lhs(), proxy.rhs(), lhs);
+         }
+       };
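+ 
+       // Usage sketch (illustrative; assumes the diag()/row()/column()
+       // convenience functions shipped with this release):
+       //   A = viennacl::diag(v);        // matrix with v on its diagonal
+       //   d = viennacl::diag(A);        // extract a diagonal into a vector
+       //   r = viennacl::row(A, 1u);     // extract the second row
+       //   c = viennacl::column(A, 0u);  // extract the first column
+       // Each assignment resolves to one of the executors above and forwards
+       // to the matrix_diag_*/matrix_row()/matrix_column() backend routines.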
+ 
+ 
+       //////////////////// Element-wise operations ////////////////////////////////////////
+ 
+       // generic x = mat_expr1 .* mat_expr2:
+       template <typename T, typename F, typename LHS, typename RHS, typename OP>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const LHS, const RHS, op_element_binary<OP> > >
+       {
+         // x = y .* z
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+         {
+           viennacl::linalg::element_op(lhs, proxy);
+         }
+ 
+         // x = y .* mat_expr
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_expression<const LHS2, const RHS2, OP2>, op_element_binary<OP> > const & proxy)
+         {
+           matrix<T, F> temp(proxy.rhs());
+           viennacl::linalg::element_op(lhs, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(proxy.lhs(), temp));
+         }
+ 
+         // x = mat_expr .* z
+         template <typename LHS1, typename RHS1, typename OP1>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+         {
+           matrix<T, F> temp(proxy.lhs());
+           viennacl::linalg::element_op(lhs, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp, proxy.rhs()));
+         }
+ 
+         // x = mat_expr .* mat_expr
+         template <typename LHS1, typename RHS1, typename OP1,
+                   typename LHS2, typename RHS2, typename OP2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>,
+                                                                   const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                   op_element_binary<OP> > const & proxy)
+         {
+           matrix<T, F> temp1(proxy.lhs());
+           matrix<T, F> temp2(proxy.rhs());
+           viennacl::linalg::element_op(lhs, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp1, temp2));
+         }
+       };
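+ 
+       // Usage sketch (illustrative): element-wise binary operations such as
+       //   C = viennacl::linalg::element_prod(A, B);
+       //   C = viennacl::linalg::element_div(A, B);
+       // yield an op_element_binary expression handled above; whenever an
+       // operand is itself an expression, it is evaluated into a temporary
+       // matrix first, since element_op() expects plain matrix operands.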
+ 
+       // generic x += mat_expr .* mat_expr:
+       template <typename T, typename F, typename LHS, typename RHS, typename OP>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const LHS, const RHS, op_element_binary<OP> > >
+       {
+         // x += y .* z
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+         {
+           viennacl::matrix<T, F> temp(proxy);
+           lhs += temp;
+         }
+ 
+         // x += y .* mat_expr
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_expression<const LHS2, const RHS2, OP2>, op_element_binary<OP> > const & proxy)
+         {
+           matrix<T, F> temp(proxy.rhs());
+           matrix<T, F> temp2(temp.size1(), temp.size2());
+           viennacl::linalg::element_op(temp2, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(proxy.lhs(), temp));
+           lhs += temp2;
+         }
+ 
+         // x += mat_expr .* z
+         template <typename LHS1, typename RHS1, typename OP1>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+         {
+           matrix<T, F> temp(proxy.lhs());
+           matrix<T, F> temp2(temp.size1(), temp.size2());
+           viennacl::linalg::element_op(temp2, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp, proxy.rhs()));
+           lhs += temp2;
+         }
+ 
+         // x += mat_expr .* mat_expr
+         template <typename LHS1, typename RHS1, typename OP1,
+                   typename LHS2, typename RHS2, typename OP2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>,
+                                                                   const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                   op_element_binary<OP> > const & proxy)
+         {
+           matrix<T, F> temp1(proxy.lhs());
+           matrix<T, F> temp2(proxy.rhs());
+           matrix<T, F> temp3(temp1.size1(), temp1.size2());
+           viennacl::linalg::element_op(temp3, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp1, temp2));
+           lhs += temp3;
+         }
+       };
+ 
+       // generic x -= mat_expr1 .* mat_expr2:
+       template <typename T, typename F, typename LHS, typename RHS, typename OP>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const LHS, const RHS, op_element_binary<OP> > >
+       {
+ 
+         // x -= y .* z
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+         {
+           viennacl::matrix<T, F> temp(proxy);
+           lhs -= temp;
+         }
+ 
+         // x -= y .* mat_expr
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_expression<const LHS2, const RHS2, OP2>, op_element_binary<OP> > const & proxy)
+         {
+           matrix<T, F> temp(proxy.rhs());
+           matrix<T, F> temp2(temp.size1(), temp.size2());
+           viennacl::linalg::element_op(temp2, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(proxy.lhs(), temp));
+           lhs -= temp2;
+         }
+ 
+         // x -= mat_expr .* z
+         template <typename LHS1, typename RHS1, typename OP1>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+         {
+           matrix<T, F> temp(proxy.lhs());
+           matrix<T, F> temp2(temp.size1(), temp.size2());
+           viennacl::linalg::element_op(temp2, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp, proxy.rhs()));
+           lhs -= temp2;
+         }
+ 
+         // x -= mat_expr .* mat_expr
+         template <typename LHS1, typename RHS1, typename OP1,
+                   typename LHS2, typename RHS2, typename OP2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>,
+                                                                      const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                      op_element_binary<OP> > const & proxy)
+         {
+           matrix<T, F> temp1(proxy.lhs());
+           matrix<T, F> temp2(proxy.rhs());
+           matrix<T, F> temp3(temp1.size1(), temp1.size2());
+           viennacl::linalg::element_op(temp3, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp1, temp2));
+           lhs -= temp3;
+         }
+       };
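+ 
+       // Usage sketch (illustrative): the accumulating element-wise forms,
+       // e.g.
+       //   C += viennacl::linalg::element_prod(A, B);
+       // cannot write directly into the target, so the executors above
+       // evaluate the element-wise result into a temporary and then add or
+       // subtract that temporary.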
+ 
+       //////////////// Unary expressions ////////////////
+ 
+       template <typename T, typename F, typename LHS, typename RHS, typename OP>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const LHS, const RHS, op_element_unary<OP> > >
+       {
+         // x = OP(y)
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> > const & proxy)
+         {
+           viennacl::linalg::element_op(lhs, proxy);
+         }
+ 
+         // x = OP(mat_expr)
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                      const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                      op_element_unary<OP> > const & proxy)
+         {
+           matrix<T, F> temp(proxy.rhs());
+           viennacl::linalg::element_op(lhs, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> >(temp, temp));
+         }
+       };
+ 
+       template <typename T, typename F, typename LHS, typename RHS, typename OP>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const LHS, const RHS, op_element_unary<OP> > >
+       {
+         // x += OP(y)
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> > const & proxy)
+         {
+           matrix<T, F> temp(proxy);
+           lhs += temp;
+         }
+ 
+         // x += OP(mat_expr)
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                   const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                   op_element_unary<OP> > const & proxy)
+         {
+           matrix<T, F> temp(proxy.rhs());
+           viennacl::linalg::element_op(temp, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> >(temp, temp)); // inplace operation is safe here
+           lhs += temp;
+         }
+       };
+ 
+       template <typename T, typename F, typename LHS, typename RHS, typename OP>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const LHS, const RHS, op_element_unary<OP> > >
+       {
+         // x -= OP(y)
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> > const & proxy)
+         {
+           matrix<T, F> temp(proxy);
+           lhs -= temp;
+         }
+ 
+         // x -= OP(mat_expr)
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                      const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                      op_element_unary<OP> > const & proxy)
+         {
+           matrix<T, F> temp(proxy.rhs());
+           viennacl::linalg::element_op(temp, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> >(temp, temp)); // inplace operation is safe here
+           lhs -= temp;
+         }
+       };
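+ 
+       // Usage sketch (illustrative; assumes the element-wise math functions
+       // of this release, e.g. element_exp()):
+       //   C  = viennacl::linalg::element_exp(A);   // direct kernel call
+       //   C += viennacl::linalg::element_exp(A);   // via a temporary
+       // Unary expressions encode their operand twice (lhs == rhs), which is
+       // why the executors above pass (temp, temp) to element_op().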
+ 
+ 
+ 
+       //////////////// Matrix-Matrix Products ////////////////
+ 
+       // C = A * B
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(0));
+         }
+       };
+ 
+       // C = A * B^T
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_base<T, F1>,
+                                                                          const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                          op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>,
+                                                                      const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                      op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(0));
+         }
+       };
+ 
+       // C = A^T * B
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                          const matrix_base<T, F2>,
+                                                                          op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                      const matrix_base<T, F2>,
+                                                                      op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(0));
+         }
+       };
+ 
+       // C = A^T * B^T
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                          const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                          op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                      const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                      op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(0));
+         }
+       };
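+ 
+       // Usage sketch (illustrative): all four transpose combinations of
+       //   C = viennacl::linalg::prod(A, B);
+       //   C = viennacl::linalg::prod(viennacl::trans(A), B);
+       // and so on map onto prod_impl() with alpha = 1 and beta = 0, i.e. a
+       // plain GEMM that overwrites C.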
+ 
+ 
+       // C += A * B
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(1.0));
+         }
+       };
+ 
+       // C += A * B^T
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_base<T, F1>,
+                                                                               const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                               op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>,
+                                                                      const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                      op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(1.0));
+         }
+       };
+ 
+       // C += A^T * B
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                               const matrix_base<T, F2>,
+                                                                               op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                      const matrix_base<T, F2>,
+                                                                      op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(1.0));
+         }
+       };
+ 
+       // C += A^T * B^T
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                               const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                               op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                      const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                      op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(1.0));
+         }
+       };
+ 
+ 
+       // C -= A * B
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(-1.0), T(1.0));
+         }
+       };
+ 
+       // C -= A * B^T
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_base<T, F1>,
+                                                                               const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                               op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>,
+                                                                      const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                      op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(-1.0), T(1.0));
+         }
+       };
+ 
+       // C -= A^T * B
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                               const matrix_base<T, F2>,
+                                                                               op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                      const matrix_base<T, F2>,
+                                                                      op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(-1.0), T(1.0));
+         }
+       };
+ 
+       // C -= A^T * B^T
+       template <typename T, typename F, typename F1, typename F2>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                               const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                               op_mat_mat_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                      const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                      op_mat_mat_prod> const & rhs)
+         {
+           viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(-1.0), T(1.0));
+         }
+       };
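+ 
+       // Usage sketch (illustrative): accumulating products such as
+       //   C += viennacl::linalg::prod(A, B);
+       //   C -= viennacl::linalg::prod(A, viennacl::trans(B));
+       // reuse the same GEMM kernel with beta = 1 and alpha = +/-1, so the
+       // update happens in place without a temporary for the product.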
+ 
+       ////////////////// Matrix-Vector Products ///////////////
+ 
+       // y = A * x
+       template <typename T, typename F>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> const & rhs)
+         {
+           // check for x = A * x
+           if (op_aliasing(lhs, rhs.rhs()))
+           {
+             vector_base<T> temp(rhs);
+             lhs = temp;
+           }
+           else
+             viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+         }
+       };
+ 
+       // y = A^T * x
+       template <typename T, typename F>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                       const vector_base<T>,
+                                                                       op_prod> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                   const vector_base<T>,
+                                                                   op_prod> const & rhs)
+         {
+           // check for x = A^T * x
+           if (op_aliasing(lhs, rhs.rhs()))
+           {
+             vector_base<T> temp(rhs);
+             lhs = temp;
+           }
+           else
+             viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+         }
+       };
+ 
+ 
+       // y += A * x
+       template <typename T, typename F>
+       struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> const & rhs)
+         {
+           vector_base<T> temp(rhs);
+           lhs += temp;
+         }
+       };
+ 
+       // y += A^T * x
+       template <typename T, typename F>
+       struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                            const vector_base<T>,
+                                                                            op_prod> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                   const vector_base<T>,
+                                                                   op_prod> const & rhs)
+         {
+           vector_base<T> temp(rhs);
+           lhs += temp;
+         }
+       };
+ 
+ 
+       // y -= A * x
+       template <typename T, typename F>
+       struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> const & rhs)
+         {
+           vector_base<T> temp(rhs);
+           lhs -= temp;
+         }
+       };
+ 
+       // y -= A^T * x
+       template <typename T, typename F>
+       struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                            const vector_base<T>,
+                                                                            op_prod> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                   const vector_base<T>,
+                                                                   op_prod> const & rhs)
+         {
+           vector_base<T> temp(rhs);
+           lhs -= temp;
+         }
+       };
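+ 
+       // Usage sketch (illustrative): matrix-vector products
+       //   y = viennacl::linalg::prod(A, x);
+       // execute directly, with an explicit temporary only for the aliasing
+       // case x = prod(A, x). The accumulating forms (y += / y -= prod(A, x))
+       // always buffer the product first, since the three-argument
+       // prod_impl() used here has no accumulation parameter.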
+ 
+ 
+ 
+       ////////////////// Rank-1 Updates ///////////////
+ 
+       // A = v1 * v2^T
+       template <typename T, typename F>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> const & rhs)
+         {
+           lhs.clear();
+           viennacl::linalg::scaled_rank_1_update(lhs, T(1.0), 1, false, false, rhs.lhs(), rhs.rhs());
+         }
+       };
+ 
+       // A = alpha * v1 * v2^T
+       template <typename T, typename F, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_assign, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                           const ScalarType,
+                                                                           op_mult> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                       const ScalarType,
+                                                                       op_mult> const & rhs)
+         {
+           lhs.clear();
+           viennacl::linalg::scaled_rank_1_update(lhs, rhs.rhs(), 1, false, false, rhs.lhs().lhs(), rhs.lhs().rhs());
+         }
+       };
+ 
+       // A += v1 * v2^T
+       template <typename T, typename F>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> const & rhs)
+         {
+           viennacl::linalg::scaled_rank_1_update(lhs, T(1.0), 1, false, false, rhs.lhs(), rhs.rhs());
+         }
+       };
+ 
+       // A += alpha * v1 * v2^T
+       template <typename T, typename F, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                                const ScalarType,
+                                                                                op_mult> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                       const ScalarType,
+                                                                       op_mult> const & rhs)
+         {
+           viennacl::linalg::scaled_rank_1_update(lhs, rhs.rhs(), 1, false, false, rhs.lhs().lhs(), rhs.lhs().rhs());
+         }
+       };
+ 
+       // A -= v1 * v2^T
+       template <typename T, typename F>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> const & rhs)
+         {
+           viennacl::linalg::scaled_rank_1_update(lhs, T(1.0), 1, false, true, rhs.lhs(), rhs.rhs());
+         }
+       };
+ 
+       // A -= alpha * v1 * v2^T
+       template <typename T, typename F, typename ScalarType>
+       struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                                const ScalarType,
+                                                                                op_mult> >
+       {
+         static void apply(matrix_base<T, F> & lhs, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                       const ScalarType,
+                                                                       op_mult> const & rhs)
+         {
+           viennacl::linalg::scaled_rank_1_update(lhs, rhs.rhs(), 1, false, true, rhs.lhs().lhs(), rhs.lhs().rhs());
+         }
+       };
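+ 
+       // Usage sketch (illustrative): rank-1 updates written as
+       //   A  = viennacl::linalg::outer_prod(v1, v2);
+       //   A += viennacl::linalg::outer_prod(v1, v2) * 0.5;
+       // reach the executors above; plain assignment clears A first because
+       // scaled_rank_1_update() accumulates into its target.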
+ 
+ 
+     } // namespace detail
+ 
+   } // namespace linalg
+ 
+   /** \endcond */
+ 
+ } //namespace viennacl
+ 
+ #endif
++>>>>>>> upstream/1.5.1
diff --cc viennacl/matrix_proxy.hpp
index 324d07c,8941de2..d7d4f50
--- a/viennacl/matrix_proxy.hpp
+++ b/viennacl/matrix_proxy.hpp
@@@ -39,177 -50,18 +50,126 @@@ namespace viennacl
        typedef range::difference_type              difference_type;
        typedef value_type                          reference;
        typedef const value_type &                  const_reference;
++<<<<<<< HEAD
 +      
 +      matrix_range(MatrixType & A, 
 +                   range const & row_range,
 +                   range const & col_range) : A_(&A), row_range_(row_range), col_range_(col_range) {}
 +                   
 +      size_type start1() const { return row_range_.start(); }
 +      size_type size1() const { return row_range_.size(); }
 +
 +      size_type start2() const { return col_range_.start(); }
 +      size_type size2() const { return col_range_.size(); }
 +      
 +      ////////// operator= //////////////////////////
 +      
 +      template <typename MatrixType2>
 +      matrix_range<MatrixType> & operator = (const MatrixType2 & other) 
 +      {
 +        viennacl::linalg::assign(*this, other);
 +        return *this;
 +      }
 +
 +      
 +      template <typename MatrixType1, typename MatrixType2>
 +      matrix_range<MatrixType> & operator = (const matrix_expression< MatrixType1,
 +                                                                      MatrixType2,
 +                                                                      op_prod > & proxy) 
 +      {
 +        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +      
 +      template <typename MatrixType1, typename MatrixType2>
 +      matrix_range<MatrixType> & 
 +      operator = (const matrix_expression< MatrixType1,
 +                                           MatrixType2,
 +                                           op_add > & proxy) 
 +      {
 +        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +
 +      template <typename MatrixType1, typename MatrixType2>
 +      matrix_range<MatrixType> & 
 +      operator = (const matrix_expression< MatrixType1,
 +                                           MatrixType2,
 +                                           op_sub > & proxy) 
 +      {
 +        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +
 +
 +      ////////// operator+= //////////////////////////
 +
 +      matrix_range<MatrixType> & operator += (matrix_range<MatrixType> const & other)
 +      {
 +        viennacl::linalg::inplace_add(*this, other);
 +        return *this;
 +      }
 +      
 +      template <typename MatrixType1, typename MatrixType2>
 +      matrix_range<MatrixType> & operator += (const matrix_expression< MatrixType1,
 +                                                                       MatrixType2,
 +                                                                       op_prod > & proxy)
 +      {
 +        MatrixType temp = proxy;
 +        viennacl::linalg::inplace_add(*this, temp);
 +        return *this;
 +      }
 +      
 +      
 +      ////////// operator-= //////////////////////////
 +      matrix_range<MatrixType> & operator -= (matrix_range<MatrixType> const & other)
 +      {
 +        viennacl::linalg::inplace_sub(*this, other);
 +        return *this;
 +      }
 +      
 +      template <typename MatrixType1, typename MatrixType2>
 +      matrix_range<MatrixType> & operator -= (const matrix_expression< MatrixType1,
 +                                                                       MatrixType2,
 +                                                                       op_prod > & proxy)
 +      {
 +        MatrixType temp = proxy;
 +        viennacl::linalg::inplace_sub(*this, temp);
 +        return *this;
 +      }
 +
 +
 +      ////////// operator*= //////////////////////////
 +
 +      template <typename T>
 +      matrix_range<MatrixType> & operator *= (T const & val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, val);
 +        return *this;
 +      }
 +      
 +      ////////// operator/= //////////////////////////
 +
 +      template <typename T>
 +      matrix_range<MatrixType> & operator /= (T const & val)
 +      {
 +        viennacl::linalg::inplace_divide(*this, val);
 +        return *this;
 +      }
++=======
++>>>>>>> upstream/1.5.1
  
-       matrix_range<MatrixType> & operator /= (cpu_value_type val)
-       {
-         viennacl::linalg::inplace_mult(*this, cpu_value_type(1.0) / val);
-         return *this;
-       }
- 
- 
-       ////////// operator+ //////////////////////////
-       
-       template <typename MatrixType2>
-       typename viennacl::enable_if< viennacl::is_matrix<MatrixType2>::value,
-                                     matrix_expression< const matrix_range<MatrixType>,
-                                                        const MatrixType2,
-                                                        op_add > >::type
-       operator + (const MatrixType2 & other) 
-       {
-         return matrix_expression< const matrix_range<MatrixType>,
-                                   const MatrixType2,
-                                   op_add > (*this, other);
-       }
-       
-       ////////// operator- //////////////////////////
-       
-       template <typename MatrixType2>
-       typename viennacl::enable_if< viennacl::is_matrix<MatrixType2>::value,
-                                     matrix_expression< const matrix_range<MatrixType>,
-                                                        const MatrixType2,
-                                                        op_sub > >::type
-       operator - (const MatrixType2 & other) 
-       {
-         return matrix_expression< const matrix_range<MatrixType>,
-                                   const MatrixType2,
-                                   op_sub > (*this, other);
-       }
-       
-       
-       
- 
-       //const_reference operator()(size_type i, size_type j) const { return A_(start1() + i, start2() + i); }
-       //reference operator()(size_type i, size_type j) { return A_(start1() + i, start2() + i); }
+       matrix_range(MatrixType & A,
+                    range const & row_range,
+                    range const & col_range) : base_type(A.handle(),
+                                                         row_range.size(), row_range.start(), 1, A.internal_size1(),
+                                                         col_range.size(), col_range.start(), 1, A.internal_size2()) {}
  
-       MatrixType & get() { return *A_; }
-       const MatrixType & get() const { return *A_; }
+       using base_type::operator=;
  
-     private:
-       MatrixType * A_;
-       range row_range_;
-       range col_range_;
    };
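+   // Usage sketch (illustrative, not part of this header): constructing the
+   // range proxy directly and assigning through the inherited operator=.
+   //
+   //   viennacl::matrix<float> A(8, 8), B(4, 4);
+   //   viennacl::matrix_range<viennacl::matrix<float> >
+   //       sub(A, viennacl::range(0, 4), viennacl::range(0, 4));
+   //   sub = B;   // handled by the matrix_base assignment pulled in above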
  
-   
-   /** @brief Returns an expression template class representing a transposed matrix */
-   template <typename MatrixType>
-   matrix_expression< const matrix_range<MatrixType>,
-                      const matrix_range<MatrixType>,
-                      op_trans> trans(const matrix_range<MatrixType> & mat)
-   {
-     return matrix_expression< const matrix_range<MatrixType>,
-                               const matrix_range<MatrixType>,
-                               op_trans>(mat, mat);
-   }
-   
-   
-   
-   
+ 
    /////////////////////////////////////////////////////////////
    ///////////////////////// CPU to GPU ////////////////////////
    /////////////////////////////////////////////////////////////
@@@ -220,103 -72,83 +180,162 @@@
              matrix_range<matrix<SCALARTYPE, row_major, 1> > & gpu_matrix_range )
    {
      assert( (cpu_matrix.size1() == gpu_matrix_range.size1())
++<<<<<<< HEAD
 +           && (cpu_matrix.size2() == gpu_matrix_range.size2()) );
 +    
 +     if ( gpu_matrix_range.start2() != 0 ||  gpu_matrix_range.size2() !=  gpu_matrix_range.get().size2())
 +     {
 +       std::vector<SCALARTYPE> entries(gpu_matrix_range.size2());
 +       
 +       //copy each stride separately:
 +       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
 +       {
 +         for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
 +           entries[j] = cpu_matrix(i,j);
 +         
 +         std::size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.get().internal_size2() + gpu_matrix_range.start2();
 +         std::size_t num_entries = gpu_matrix_range.size2();
 +         cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
 +                                          sizeof(SCALARTYPE)*start_offset,
 +                                          sizeof(SCALARTYPE)*num_entries,
 +                                          &(entries[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        //std::cout << "Strided copy worked!" << std::endl;
 +       }
 +     }
 +     else
 +     {
 +       //full block can be copied: 
 +       std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.size2());
 +       
 +       //copy each stride separately:
 +       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
 +         for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
 +           entries[i*gpu_matrix_range.get().internal_size2() + j] = cpu_matrix(i,j);
 +       
 +       std::size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.get().internal_size2();
 +       std::size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
 +       //std::cout << "start_offset: " << start_offset << std::endl;
 +       cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
 +                                         sizeof(SCALARTYPE)*start_offset,
 +                                         sizeof(SCALARTYPE)*num_entries,
 +                                         &(entries[0]), 0, NULL, NULL);
 +       VIENNACL_ERR_CHECK(err);
 +       //std::cout << "Block copy worked!" << std::endl;
 +     }
++=======
+            && (cpu_matrix.size2() == gpu_matrix_range.size2())
+            && bool("Matrix size mismatch!"));
+ 
+     if ( gpu_matrix_range.start2() != 0)
+     {
+       std::vector<SCALARTYPE> entries(gpu_matrix_range.size2());
+ 
+       //copy each stride separately:
+       for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+       {
+         for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+           entries[j] = cpu_matrix(i,j);
+ 
+         vcl_size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.internal_size2() + gpu_matrix_range.start2();
+         vcl_size_t num_entries = gpu_matrix_range.size2();
+         viennacl::backend::memory_write(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+       //std::cout << "Strided copy worked!" << std::endl;
+       }
+     }
+     else
+     {
+       //full block can be copied:
+       std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.internal_size2());
+ 
+       //copy each stride separately:
+       for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+         for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+           entries[i*gpu_matrix_range.internal_size2() + j] = cpu_matrix(i,j);
+ 
+       vcl_size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.internal_size2();
+       vcl_size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.internal_size2();
+       viennacl::backend::memory_write(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+       //std::cout << "Block copy worked!" << std::endl;
+     }
++>>>>>>> upstream/1.5.1
    }
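+   // Usage sketch (illustrative; cpu_matrix can be any type exposing
+   // size1()/size2() and operator()(i,j), e.g. a boost::numeric::ublas matrix):
+   //
+   //   boost::numeric::ublas::matrix<float> cpu(4, 4);   // fill as needed
+   //   viennacl::matrix<float> gpu(8, 8);
+   //   viennacl::matrix_range<viennacl::matrix<float> >
+   //       sub(gpu, viennacl::range(2, 6), viennacl::range(2, 6));
+   //   viennacl::copy(cpu, sub);   // start2() != 0 here, so row-wise writes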
-   
+ 
    //column_major:
    template <typename CPU_MATRIX, typename SCALARTYPE>
    void copy(const CPU_MATRIX & cpu_matrix,
              matrix_range<matrix<SCALARTYPE, column_major, 1> > & gpu_matrix_range )
    {
      assert( (cpu_matrix.size1() == gpu_matrix_range.size1())
-            && (cpu_matrix.size2() == gpu_matrix_range.size2()) );
-     
-      if ( gpu_matrix_range.start1() != 0 ||  gpu_matrix_range.size1() != gpu_matrix_range.get().size1())
+            && (cpu_matrix.size2() == gpu_matrix_range.size2())
+            && bool("Matrix size mismatch!"));
+ 
+      if ( gpu_matrix_range.start1() != 0)
       {
         std::vector<SCALARTYPE> entries(gpu_matrix_range.size1());
-        
+ 
         //copy each stride separately:
++<<<<<<< HEAD
 +       for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
 +       {
 +         for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
 +           entries[i] = cpu_matrix(i,j);
 +         
 +         std::size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.get().internal_size1() + gpu_matrix_range.start1();
 +         std::size_t num_entries = gpu_matrix_range.size1();
 +         cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
 +                                          sizeof(SCALARTYPE)*start_offset,
 +                                          sizeof(SCALARTYPE)*num_entries,
 +                                          &(entries[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
++=======
+        for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+        {
+          for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+            entries[i] = cpu_matrix(i,j);
+ 
+          vcl_size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.internal_size1() + gpu_matrix_range.start1();
+          vcl_size_t num_entries = gpu_matrix_range.size1();
+          viennacl::backend::memory_write(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
++>>>>>>> upstream/1.5.1
          //std::cout << "Strided copy worked!" << std::endl;
         }
       }
       else
       {
-        //full block can be copied: 
-        std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.size2());
-        
+        //full block can be copied:
+        std::vector<SCALARTYPE> entries(gpu_matrix_range.internal_size1()*gpu_matrix_range.size2());
+ 
         //copy each stride separately:
++<<<<<<< HEAD
 +       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
 +         for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
 +           entries[i + j*gpu_matrix_range.get().internal_size1()] = cpu_matrix(i,j);
 +       
 +       std::size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.get().internal_size1();
 +       std::size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
 +       //std::cout << "start_offset: " << start_offset << std::endl;
 +       cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
 +                                         sizeof(SCALARTYPE)*start_offset,
 +                                         sizeof(SCALARTYPE)*num_entries,
 +                                         &(entries[0]), 0, NULL, NULL);
 +       VIENNACL_ERR_CHECK(err);
++=======
+        for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+          for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+            entries[i + j*gpu_matrix_range.internal_size1()] = cpu_matrix(i,j);
+ 
+        vcl_size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.internal_size1();
+        vcl_size_t num_entries = gpu_matrix_range.internal_size1() * gpu_matrix_range.size2();
+        viennacl::backend::memory_write(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
++>>>>>>> upstream/1.5.1
         //std::cout << "Block copy worked!" << std::endl;
       }
-     
+ 
    }
  
  
@@@ -331,105 -163,84 +350,153 @@@
              CPU_MATRIX & cpu_matrix)
    {
      assert( (cpu_matrix.size1() == gpu_matrix_range.size1())
-            && (cpu_matrix.size2() == gpu_matrix_range.size2()) );
-     
-      if ( gpu_matrix_range.start2() != 0 ||  gpu_matrix_range.size2() !=  gpu_matrix_range.get().size2())
+            && (cpu_matrix.size2() == gpu_matrix_range.size2())
+            && bool("Matrix size mismatch!"));
+ 
+      if ( gpu_matrix_range.start2() != 0)
       {
         std::vector<SCALARTYPE> entries(gpu_matrix_range.size2());
-        
+ 
         //copy each stride separately:
++<<<<<<< HEAD
 +       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
 +       {
 +         std::size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.get().internal_size2() + gpu_matrix_range.start2();
 +         std::size_t num_entries = gpu_matrix_range.size2();
 +         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
 +                                          sizeof(SCALARTYPE)*start_offset,
 +                                          sizeof(SCALARTYPE)*num_entries,
 +                                          &(entries[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        //std::cout << "Strided copy worked!" << std::endl;
 +        
 +        for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
++=======
+        for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+        {
+          vcl_size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.internal_size2() + gpu_matrix_range.start2();
+          vcl_size_t num_entries = gpu_matrix_range.size2();
+          viennacl::backend::memory_read(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+         //std::cout << "Strided copy worked!" << std::endl;
+ 
+         for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
++>>>>>>> upstream/1.5.1
            cpu_matrix(i,j) = entries[j];
-          
         }
       }
       else
       {
++<<<<<<< HEAD
 +       //full block can be copied: 
 +       std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.size2());
 +       
 +       std::size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.get().internal_size2();
 +       std::size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
 +       //std::cout << "start_offset: " << start_offset << std::endl;
 +       cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
 +                                         sizeof(SCALARTYPE)*start_offset,
 +                                         sizeof(SCALARTYPE)*num_entries,
 +                                         &(entries[0]), 0, NULL, NULL);
 +       VIENNACL_ERR_CHECK(err);
 +       //std::cout << "Block copy worked!" << std::endl;
 +
 +       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
 +         for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
 +           cpu_matrix(i,j) = entries[i*gpu_matrix_range.get().internal_size2() + j];
++=======
+        //full block can be copied:
+        std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.internal_size2());
+ 
+        vcl_size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.internal_size2();
+        vcl_size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
+        viennacl::backend::memory_read(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+        //std::cout << "Block copy worked!" << std::endl;
+ 
+        for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+          for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+            cpu_matrix(i,j) = entries[i*gpu_matrix_range.internal_size2() + j];
++>>>>>>> upstream/1.5.1
      }
-     
+ 
    }
-   
-   
+ 
+ 
    //column_major:
    template <typename CPU_MATRIX, typename SCALARTYPE>
    void copy(matrix_range<matrix<SCALARTYPE, column_major, 1> > const & gpu_matrix_range,
              CPU_MATRIX & cpu_matrix)
    {
      assert( (cpu_matrix.size1() == gpu_matrix_range.size1())
-            && (cpu_matrix.size2() == gpu_matrix_range.size2()) );
-     
-      if ( gpu_matrix_range.start1() != 0 ||  gpu_matrix_range.size1() !=  gpu_matrix_range.get().size1())
+            && (cpu_matrix.size2() == gpu_matrix_range.size2())
+            && bool("Matrix size mismatch!"));
+ 
+      if ( gpu_matrix_range.start1() != 0)
       {
         std::vector<SCALARTYPE> entries(gpu_matrix_range.size1());
-        
+ 
         //copy each stride separately:
++<<<<<<< HEAD
 +       for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
 +       {
 +         std::size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.get().internal_size1() + gpu_matrix_range.start1();
 +         std::size_t num_entries = gpu_matrix_range.size1();
 +         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
 +                                          sizeof(SCALARTYPE)*start_offset,
 +                                          sizeof(SCALARTYPE)*num_entries,
 +                                          &(entries[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        //std::cout << "Strided copy worked!" << std::endl;
 +        
 +        for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
++=======
+        for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+        {
+          vcl_size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.internal_size1() + gpu_matrix_range.start1();
+          vcl_size_t num_entries = gpu_matrix_range.size1();
+          viennacl::backend::memory_read(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+         //std::cout << "Strided copy worked!" << std::endl;
+ 
+         for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
++>>>>>>> upstream/1.5.1
            cpu_matrix(i,j) = entries[i];
         }
       }
       else
       {
-        //full block can be copied: 
-        std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.size2());
-        
+        //full block can be copied:
+        std::vector<SCALARTYPE> entries(gpu_matrix_range.internal_size1()*gpu_matrix_range.size2());
+ 
         //copy each stride separately:
++<<<<<<< HEAD
 +       std::size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.get().internal_size1();
 +       std::size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
 +       //std::cout << "start_offset: " << start_offset << std::endl;
 +       cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
 +                                         sizeof(SCALARTYPE)*start_offset,
 +                                         sizeof(SCALARTYPE)*num_entries,
 +                                         &(entries[0]), 0, NULL, NULL);
 +       VIENNACL_ERR_CHECK(err);
 +       //std::cout << "Block copy worked!" << std::endl;
 +       
 +       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
 +         for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
 +           cpu_matrix(i,j) = entries[i + j*gpu_matrix_range.get().internal_size1()];
++=======
+        vcl_size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.internal_size1();
+        vcl_size_t num_entries = gpu_matrix_range.internal_size1() * gpu_matrix_range.size2();
+        viennacl::backend::memory_read(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+        //std::cout << "Block copy worked!" << std::endl;
+ 
+        for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+          for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+            cpu_matrix(i,j) = entries[i + j*gpu_matrix_range.internal_size1()];
++>>>>>>> upstream/1.5.1
       }
-     
+ 
    }
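+   // Readback sketch (illustrative): the GPU-to-CPU overloads mirror the
+   // CPU-to-GPU ones, so the block written in the earlier sketch can be
+   // retrieved with
+   //
+   //   boost::numeric::ublas::matrix<float> result(4, 4);
+   //   viennacl::copy(sub, result);   // sub as in the earlier sketch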
  
  
@@@ -454,402 -450,36 +706,429 @@@
    // Convenience function
    //
    template <typename MatrixType>
-   matrix_range<MatrixType> project(MatrixType & A, viennacl::range const & r1, viennacl::range const & r2)
+   matrix_slice<MatrixType> project(MatrixType & A, viennacl::slice const & r1, viennacl::slice const & r2)
    {
-     return matrix_range<MatrixType>(A, r1, r2);
+     assert(r1.size() <= A.size1() && r2.size() <= A.size2() && bool("Size of slice invalid!"));
+ 
+     return matrix_slice<MatrixType>(A, r1, r2);
    }
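+   // Usage sketch (illustrative): a slice is (start, stride, size), so the
+   // projection below addresses every second row and column of an 8x8 matrix.
+   //
+   //   viennacl::matrix<float> A(8, 8);
+   //   viennacl::slice rows(0, 2, 4);   // rows 0, 2, 4, 6
+   //   viennacl::slice cols(1, 2, 4);   // columns 1, 3, 5, 7
+   //   viennacl::matrix_slice<viennacl::matrix<float> > s = viennacl::project(A, rows, cols);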
  
++<<<<<<< HEAD
 +
 +
 +
 +
 +
 +//
 +//
 +//
 +/////////////////////////////// Slice /////////////////////////////////////////////
 +//
 +//
 +//
 +
 +
 +
 +
 +
 +
 +
 +
 +
 +  template <typename MatrixType>
 +  class matrix_slice
 +  {
 +    public:
 +      typedef typename MatrixType::value_type     value_type;
 +      typedef typename viennacl::result_of::cpu_value_type<value_type>::type    cpu_value_type;
 +      typedef slice::size_type                    size_type;
 +      typedef slice::difference_type              difference_type;
 +      typedef value_type                          reference;
 +      typedef const value_type &                  const_reference;
 +      
 +      matrix_slice(MatrixType & A, 
 +                   slice const & row_slice,
 +                   slice const & col_slice) : A_(&A), row_slice_(row_slice), col_slice_(col_slice) {}
 +                   
 +      size_type start1() const { return row_slice_.start(); }
 +      size_type stride1() const { return row_slice_.stride(); }
 +      size_type size1() const { return row_slice_.size(); }
 +
 +      size_type start2() const { return col_slice_.start(); }
 +      size_type stride2() const { return col_slice_.stride(); }
 +      size_type size2() const { return col_slice_.size(); }
 +      
 +      ////////// operator= //////////////////////////
 +      
 +      template <typename MatrixType2>
 +      matrix_slice<MatrixType> & operator = (const MatrixType2 & other) 
 +      {
 +        viennacl::linalg::assign(*this, other);
 +        return *this;
 +      }
 +
 +      
 +      template <typename MatrixType1, typename MatrixType2>
 +      matrix_slice<MatrixType> & operator = (const matrix_expression< MatrixType1,
 +                                                                      MatrixType2,
 +                                                                      op_prod > & proxy) 
 +      {
 +        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +      
 +      template <typename MatrixType1, typename MatrixType2>
 +      matrix_slice<MatrixType> & 
 +      operator = (const matrix_expression< MatrixType1,
 +                                           MatrixType2,
 +                                           op_add > & proxy) 
 +      {
 +        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +
 +      template <typename MatrixType1, typename MatrixType2>
 +      matrix_slice<MatrixType> & 
 +      operator = (const matrix_expression< MatrixType1,
 +                                           MatrixType2,
 +                                           op_sub > & proxy) 
 +      {
 +        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +
 +
 +      ////////// operator+= //////////////////////////
 +
 +      matrix_slice<MatrixType> & operator += (matrix_slice<MatrixType> const & other)
 +      {
 +        viennacl::linalg::inplace_add(*this, other);
 +        return *this;
 +      }
 +      
 +      template <typename MatrixType1, typename MatrixType2>
 +      matrix_slice<MatrixType> & operator += (const matrix_expression< MatrixType1,
 +                                                                       MatrixType2,
 +                                                                       op_prod > & proxy)
 +      {
 +        MatrixType temp = proxy;
 +        viennacl::linalg::inplace_add(*this, temp);
 +        return *this;
 +      }
 +      
 +      
 +      ////////// operator-= //////////////////////////
 +      matrix_slice<MatrixType> & operator -= (matrix_slice<MatrixType> const & other)
 +      {
 +        viennacl::linalg::inplace_sub(*this, other);
 +        return *this;
 +      }
 +      
 +      template <typename MatrixType1, typename MatrixType2>
 +      matrix_slice<MatrixType> & operator -= (const matrix_expression< MatrixType1,
 +                                                                       MatrixType2,
 +                                                                       op_prod > & proxy)
 +      {
 +        MatrixType temp = proxy;
 +        viennacl::linalg::inplace_sub(*this, temp);
 +        return *this;
 +      }
 +
 +
 +      ////////// operator*= //////////////////////////
 +
 +      template <typename T>
 +      matrix_slice<MatrixType> & operator *= (T const & val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, val);
 +        return *this;
 +      }
 +      
 +      ////////// operator/= //////////////////////////
 +
 +      template <typename T>
 +      matrix_slice<MatrixType> & operator /= (T const & val)
 +      {
 +        viennacl::linalg::inplace_divide(*this, val);
 +        return *this;
 +      }
 +
 +      matrix_slice<MatrixType> & operator /= (cpu_value_type val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, cpu_value_type(1.0) / val);
 +        return *this;
 +      }
 +
 +
 +      ////////// operator+ //////////////////////////
 +      
 +      template <typename MatrixType2>
 +      typename viennacl::enable_if< viennacl::is_matrix<MatrixType2>::value,
 +                                    matrix_expression< const matrix_slice<MatrixType>,
 +                                                       const MatrixType2,
 +                                                       op_add > >::type
 +      operator + (const MatrixType2 & other) 
 +      {
 +        return matrix_expression< const matrix_slice<MatrixType>,
 +                                  const MatrixType2,
 +                                  op_add > (*this, other);
 +      }
 +      
 +      ////////// operator- //////////////////////////
 +      
 +      template <typename MatrixType2>
 +      typename viennacl::enable_if< viennacl::is_matrix<MatrixType2>::value,
 +                                    matrix_expression< const matrix_slice<MatrixType>,
 +                                                       const MatrixType2,
 +                                                       op_sub > >::type
 +      operator - (const MatrixType2 & other) 
 +      {
 +        return matrix_expression< const matrix_slice<MatrixType>,
 +                                  const MatrixType2,
 +                                  op_sub > (*this, other);
 +      }
 +      
 +      
 +      
 +
 +      //const_reference operator()(size_type i, size_type j) const { return A_(start1() + i, start2() + j); }
 +      //reference operator()(size_type i, size_type j) { return A_(start1() + i, start2() + j); }
 +
 +      MatrixType & get() { return *A_; }
 +      const MatrixType & get() const { return *A_; }
 +
 +    private:
 +      MatrixType * A_;
 +      slice row_slice_;
 +      slice col_slice_;
 +  };
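 +  // Addressing sketch (illustrative): for a row-major backing matrix, slice
 +  // element (i,j) sits at linear offset
 +  //   (start1() + i*stride1()) * internal_size2() + start2() + j*stride2()
 +  // which is the same per-row offset the copy() helpers below compute.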
 +
 +  
 +  /** @brief Returns an expression template class representing a transposed matrix */
 +  template <typename MatrixType>
 +  matrix_expression< const matrix_slice<MatrixType>,
 +                     const matrix_slice<MatrixType>,
 +                     op_trans> trans(const matrix_slice<MatrixType> & mat)
 +  {
 +    return matrix_expression< const matrix_slice<MatrixType>,
 +                              const matrix_slice<MatrixType>,
 +                              op_trans>(mat, mat);
 +  }
 +  
 +  
 +  
 +  
 +  /////////////////////////////////////////////////////////////
 +  ///////////////////////// CPU to GPU ////////////////////////
 +  /////////////////////////////////////////////////////////////
 +  
 +  //row_major:
 +  template <typename CPU_MATRIX, typename SCALARTYPE>
 +  void copy(const CPU_MATRIX & cpu_matrix,
 +            matrix_slice<matrix<SCALARTYPE, row_major, 1> > & gpu_matrix_slice )
 +  {
 +    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
 +           && (cpu_matrix.size2() == gpu_matrix_slice.size2()) );
 +    
 +     if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
 +     {
 +       std::size_t num_entries = gpu_matrix_slice.size2() * gpu_matrix_slice.stride2(); //no. of entries per stride
 +       
 +       std::vector<SCALARTYPE> entries(num_entries);
 +       
 +       //copy each stride separately:
 +       for (std::size_t i=0; i < gpu_matrix_slice.size1(); ++i)
 +       {
 +         std::size_t start_offset = (gpu_matrix_slice.start1() + i * gpu_matrix_slice.stride1()) * gpu_matrix_slice.get().internal_size2() + gpu_matrix_slice.start2();
 +         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                          gpu_matrix_slice.get().handle().get(), CL_TRUE, 
 +                                          sizeof(SCALARTYPE)*start_offset,
 +                                          sizeof(SCALARTYPE)*num_entries,
 +                                          &(entries[0]), 0, NULL, NULL);
 +         VIENNACL_ERR_CHECK(err);
 +         
 +         for (std::size_t j=0; j < gpu_matrix_slice.size2(); ++j)
 +           entries[j * gpu_matrix_slice.stride2()] = cpu_matrix(i,j);
 +         
 +         err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                    gpu_matrix_slice.get().handle().get(), CL_TRUE, 
 +                                    sizeof(SCALARTYPE)*start_offset,
 +                                    sizeof(SCALARTYPE)*num_entries,
 +                                    &(entries[0]), 0, NULL, NULL);
 +         
 +         VIENNACL_ERR_CHECK(err);
 +       }
 +     }
 +  }
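 +  // Pattern sketch (illustrative): a slice row is non-contiguous, so each row
 +  // is updated read-modify-write -- fetch the covered span, patch every
 +  // stride2()-th entry from cpu_matrix, then write the span back:
 +  //
 +  //   read   [start_offset, start_offset + num_entries)   // -> entries
 +  //   entries[j * stride2()] = cpu_matrix(i, j)           // for each j
 +  //   write  entries                                      // -> same span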
 +  
 +  //column_major:
 +  template <typename CPU_MATRIX, typename SCALARTYPE>
 +  void copy(const CPU_MATRIX & cpu_matrix,
 +            matrix_slice<matrix<SCALARTYPE, column_major, 1> > & gpu_matrix_slice )
 +  {
 +    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
 +           && (cpu_matrix.size2() == gpu_matrix_slice.size2()) );
 +    
 +    if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
 +    {
 +      std::size_t num_entries = gpu_matrix_slice.size1() * gpu_matrix_slice.stride1(); //no. of entries per stride
 +      
 +      std::vector<SCALARTYPE> entries(num_entries);
 +      
 +      //copy each column stride separately:
 +      for (std::size_t j=0; j < gpu_matrix_slice.size2(); ++j)
 +      {
 +        std::size_t start_offset = gpu_matrix_slice.start1() + (gpu_matrix_slice.start2() + j * gpu_matrix_slice.stride2()) * gpu_matrix_slice.get().internal_size1();
 +        
 +        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                        gpu_matrix_slice.get().handle().get(), CL_TRUE, 
 +                                        sizeof(SCALARTYPE)*start_offset,
 +                                        sizeof(SCALARTYPE)*num_entries,
 +                                        &(entries[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        
 +        for (std::size_t i=0; i < gpu_matrix_slice.size1(); ++i)
 +          entries[i * gpu_matrix_slice.stride1()] = cpu_matrix(i,j);
 +        
 +        err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                   gpu_matrix_slice.get().handle().get(), CL_TRUE, 
 +                                   sizeof(SCALARTYPE)*start_offset,
 +                                   sizeof(SCALARTYPE)*num_entries,
 +                                   &(entries[0]), 0, NULL, NULL);
 +        
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +    }
 +    
 +  }
 +
 +
 +  /////////////////////////////////////////////////////////////
 +  ///////////////////////// GPU to CPU ////////////////////////
 +  /////////////////////////////////////////////////////////////
 +  
 +  
 +  //row_major:
 +  template <typename CPU_MATRIX, typename SCALARTYPE>
 +  void copy(matrix_slice<matrix<SCALARTYPE, row_major, 1> > const & gpu_matrix_slice,
 +            CPU_MATRIX & cpu_matrix)
 +  {
 +    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
 +           && (cpu_matrix.size2() == gpu_matrix_slice.size2()) );
 +    
 +     if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
 +     {
 +       std::size_t num_entries = gpu_matrix_slice.size2() * gpu_matrix_slice.stride2(); //no. of entries per stride
 +       
 +       std::vector<SCALARTYPE> entries(num_entries);
 +       
 +       //copy each stride separately:
 +       for (std::size_t i=0; i < gpu_matrix_slice.size1(); ++i)
 +       {
 +         std::size_t start_offset = (gpu_matrix_slice.start1() + i * gpu_matrix_slice.stride1()) * gpu_matrix_slice.get().internal_size2() + gpu_matrix_slice.start2();
 +         
 +         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                          gpu_matrix_slice.get().handle().get(), CL_TRUE, 
 +                                          sizeof(SCALARTYPE)*start_offset,
 +                                          sizeof(SCALARTYPE)*num_entries,
 +                                          &(entries[0]), 0, NULL, NULL);
 +         
 +         for (std::size_t j=0; j < gpu_matrix_slice.size2(); ++j)
 +           cpu_matrix(i,j) = entries[j * gpu_matrix_slice.stride2()];
 +         
 +        VIENNACL_ERR_CHECK(err);
 +       }
 +     }
 +    
 +  }
 +  
 +  
 +  //column_major:
 +  template <typename CPU_MATRIX, typename SCALARTYPE>
 +  void copy(matrix_slice<matrix<SCALARTYPE, column_major, 1> > const & gpu_matrix_slice,
 +            CPU_MATRIX & cpu_matrix)
 +  {
 +    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
 +           && (cpu_matrix.size2() == gpu_matrix_slice.size2()) );
 +    
 +    if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
 +    {
 +      std::size_t num_entries = gpu_matrix_slice.size1() * gpu_matrix_slice.stride1(); //no. of entries per stride
 +      
 +      std::vector<SCALARTYPE> entries(num_entries);
 +      
 +      //copy each column stride separately:
 +      for (std::size_t j=0; j < gpu_matrix_slice.size2(); ++j)
 +      {
 +        std::size_t start_offset = gpu_matrix_slice.start1() + (gpu_matrix_slice.start2() + j * gpu_matrix_slice.stride2()) * gpu_matrix_slice.get().internal_size1();
 +        
 +        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                        gpu_matrix_slice.get().handle().get(), CL_TRUE, 
 +                                        sizeof(SCALARTYPE)*start_offset,
 +                                        sizeof(SCALARTYPE)*num_entries,
 +                                        &(entries[0]), 0, NULL, NULL);
 +        
 +        for (std::size_t i=0; i < gpu_matrix_slice.size1(); ++i)
 +          cpu_matrix(i,j) = entries[i * gpu_matrix_slice.stride1()];
 +        
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +    }
 +    
 +  }
 +
 +
 +  template<typename MatrixType>
 +  std::ostream & operator<<(std::ostream & s, matrix_slice<MatrixType> const & proxy)
 +  {
 +    MatrixType temp = proxy;
 +    s << temp;
 +    return s;
 +  }
 +
 +  template<typename MatrixType>
 +  std::ostream & operator<<(std::ostream & s, matrix_slice<const MatrixType> const & proxy)
 +  {
 +    MatrixType temp = proxy;
 +    s << temp;
 +    return s;
 +  }
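 +  // Note (illustrative): both stream operators first materialize the proxy
 +  // into a temporary MatrixType, so e.g.
 +  //
 +  //   std::cout << viennacl::project(A, rows, cols) << std::endl;
 +  //
 +  // copies the whole slice into a temporary matrix before printing it.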
 +
 +
 +  //
 +  // Convenience function
 +  //
 +  template <typename MatrixType>
 +  matrix_slice<MatrixType> project(MatrixType & A, viennacl::slice const & r1, viennacl::slice const & r2)
 +  {
 +    return matrix_slice<MatrixType>(A, r1, r2);
 +  }
 +
++=======
+   template <typename MatrixType>
+   matrix_slice<MatrixType> project(matrix_range<MatrixType> & A, viennacl::slice const & r1, viennacl::slice const & r2)
+   {
+     assert(r1.size() <= A.size1() && r2.size() <= A.size2() && bool("Size of slice invalid!"));
+ 
+     return matrix_slice<MatrixType>(A,
+                                     viennacl::slice(A.start1() + r1.start(), r1.stride(), r1.size()),
+                                     viennacl::slice(A.start2() + r2.start(), r2.stride(), r2.size())
+                                    );
+   }
+ 
+   template <typename MatrixType>
+   matrix_slice<MatrixType> project(matrix_slice<MatrixType> & A, viennacl::slice const & r1, viennacl::slice const & r2)
+   {
+     assert(r1.size() <= A.size1() && r2.size() <= A.size2() && bool("Size of slice invalid!"));
+ 
+     return matrix_slice<MatrixType>(A,
+                                     viennacl::slice(A.start1() + r1.start(), A.stride1() * r1.stride(), r1.size()),
+                                     viennacl::slice(A.start2() + r2.start(), A.stride2() * r2.stride(), r2.size())
+                                    );
+   }
++>>>>>>> upstream/1.5.1
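+   // Composition sketch (illustrative): projecting a proxy re-bases the slice
+   // onto the underlying matrix, so projections nest and strides multiply:
+   //
+   //   viennacl::matrix<float> A(16, 16);
+   //   viennacl::matrix_slice<viennacl::matrix<float> > outer
+   //       = viennacl::project(A, viennacl::slice(0, 2, 8), viennacl::slice(0, 2, 8));
+   //   viennacl::matrix_slice<viennacl::matrix<float> > inner   // stride 4 in A
+   //       = viennacl::project(outer, viennacl::slice(0, 2, 4), viennacl::slice(0, 2, 4));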
  
+   // TODO: Allow mix of range/slice
  
  }
  
diff --cc viennacl/meta/predicate.hpp
index a3c7151,fac514e..5efb701
--- a/viennacl/meta/predicate.hpp
+++ b/viennacl/meta/predicate.hpp
@@@ -1,125 -1,511 +1,639 @@@
++<<<<<<< HEAD
 +#ifndef VIENNACL_META_PREDICATE_HPP_
 +#define VIENNACL_META_PREDICATE_HPP_
 +
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file predicate.hpp
 +    @brief All the predicates used within ViennaCL. Checks whether expressions are vectors, etc.
 +*/
 +
 +#include <string>
 +#include <fstream>
 +#include <sstream>
 +#include "viennacl/forwards.h"
 +
 +namespace viennacl
 +{
 +    //
 +    // is_cpu_scalar: checks for float or double
 +    //
 +    template <typename T>
 +    struct is_cpu_scalar
 +    {
 +      enum { value = false };
 +    };
 +  
 +    template <>
 +    struct is_cpu_scalar<float>
 +    {
 +      enum { value = true };
 +    };
 +
 +    template <>
 +    struct is_cpu_scalar<double>
 +    {
 +      enum { value = true };
 +    };
 +    
 +    //
 +    // is_scalar: checks for viennacl::scalar
 +    //
 +    template <typename T>
 +    struct is_scalar
 +    {
 +      enum { value = false };
 +    };
 +  
 +    template <typename T>
 +    struct is_scalar<viennacl::scalar<T> >
 +    {
 +      enum { value = true };
 +    };
 +  
 +    //
 +    // is_vector
 +    //
 +    template <typename T>
 +    struct is_vector
 +    {
 +      enum { value = false };
 +    };
 +
 +    template <typename ScalarType, unsigned int ALIGNMENT>
 +    struct is_vector<viennacl::vector<ScalarType, ALIGNMENT> >
 +    {
 +      enum { value = true };
 +    };
 +
 +    template <typename T>
 +    struct is_vector<viennacl::vector_range<T> >
 +    {
 +      enum { value = true };
 +    };
 +    
 +    template <typename T>
 +    struct is_vector<viennacl::vector_slice<T> >
 +    {
 +      enum { value = true };
 +    };
 +    
 +    
 +    //
 +    // is_matrix
 +    //
 +    template <typename T>
 +    struct is_matrix
 +    {
 +      enum { value = false };
 +    };
 +
 +    template <typename ScalarType, typename F, unsigned int ALIGNMENT>
 +    struct is_matrix<viennacl::matrix<ScalarType, F, ALIGNMENT> >
 +    {
 +      enum { value = true };
 +    };
 +
 +    template <typename T>
 +    struct is_matrix<viennacl::matrix_range<T> >
 +    {
 +      enum { value = true };
 +    };
 +    
 +    template <typename T>
 +    struct is_matrix<viennacl::matrix_slice<T> >
 +    {
 +      enum { value = true };
 +    };
 +
 +} //namespace viennacl
 +    
 +
 +#endif
++=======
+ #ifndef VIENNACL_META_PREDICATE_HPP_
+ #define VIENNACL_META_PREDICATE_HPP_
+ 
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file predicate.hpp
+     @brief All the predicates used within ViennaCL. Checks whether expressions are vectors, etc.
+ */
+ 
+ #include <string>
+ #include <fstream>
+ #include <sstream>
+ #include "viennacl/forwards.h"
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+ #ifdef __APPLE__
+ #include <OpenCL/cl.h>
+ #else
+ #include "CL/cl.h"
+ #endif
+ #endif
+ 
+ namespace viennacl
+ {
+ 
+     //
+     // is_cpu_scalar: checks for float or double
+     //
+     //template <typename T>
+     //struct is_cpu_scalar
+     //{
+     //  enum { value = false };
+     //};
+ 
+     /** \cond */
+     template <> struct is_cpu_scalar<char>           { enum { value = true }; };
+     template <> struct is_cpu_scalar<unsigned char>  { enum { value = true }; };
+     template <> struct is_cpu_scalar<short>          { enum { value = true }; };
+     template <> struct is_cpu_scalar<unsigned short> { enum { value = true }; };
+     template <> struct is_cpu_scalar<int>            { enum { value = true }; };
+     template <> struct is_cpu_scalar<unsigned int>   { enum { value = true }; };
+     template <> struct is_cpu_scalar<long>           { enum { value = true }; };
+     template <> struct is_cpu_scalar<unsigned long>  { enum { value = true }; };
+     template <> struct is_cpu_scalar<float>          { enum { value = true }; };
+     template <> struct is_cpu_scalar<double>         { enum { value = true }; };
+     /** \endcond */
+ 
+ 
+     //
+     // is_scalar: checks for viennacl::scalar
+     //
+     //template <typename T>
+     //struct is_scalar
+     //{
+     //  enum { value = false };
+     //};
+ 
+     /** \cond */
+     template <typename T>
+     struct is_scalar<viennacl::scalar<T> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_flip_sign_scalar: checks for viennacl::scalar modified with unary operator-
+     //
+     //template <typename T>
+     //struct is_flip_sign_scalar
+     //{
+     //  enum { value = false };
+     //};
+ 
+     /** \cond */
+     template <typename T>
+     struct is_flip_sign_scalar<viennacl::scalar_expression< const scalar<T>,
+                                                             const scalar<T>,
+                                                             op_flip_sign> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_any_scalar: checks for either CPU or GPU scalars, i.e. is_cpu_scalar<>::value || is_scalar<>::value
+     //
+     //template <typename T>
+     //struct is_any_scalar
+     //{
+     //  enum { value = (is_scalar<T>::value || is_cpu_scalar<T>::value || is_flip_sign_scalar<T>::value )};
+     //};
+ 
+     //
+ 
+       /** \cond */
+   #define VIENNACL_MAKE_ANY_VECTOR_TRUE(type) template<> struct is_any_vector< type > { enum { value = 1 }; };
+   #define VIENNACL_MAKE_FOR_ALL_SCALARTYPE(type) \
+     VIENNACL_MAKE_ANY_VECTOR_TRUE(type<float>)\
+     VIENNACL_MAKE_ANY_VECTOR_TRUE(type<double>)
+ 
+     VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::vector)
+     VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::vector_range)
+     VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::vector_slice)
+     VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::unit_vector)
+     VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::zero_vector)
+     VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::one_vector)
+     VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::scalar_vector)
+ 
+   #undef VIENNACL_MAKE_FOR_ALL_SCALARTYPE
+   #undef VIENNACL_MAKE_ANY_VECTOR_TRUE
+       /** \endcond */
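+     // Expansion sketch (illustrative): for viennacl::vector the two macros
+     // above generate specializations equivalent to
+     //
+     //   template<> struct is_any_vector< viennacl::vector<float> >  { enum { value = 1 }; };
+     //   template<> struct is_any_vector< viennacl::vector<double> > { enum { value = 1 }; };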
+ 
+ 
+       /** \cond */
+   #define VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE)\
+     template<> struct is_any_dense_matrix< TYPE > { enum { value = 1 }; };
+ 
+   #define VIENNACL_MAKE_FOR_ALL_SCALARTYPE(TYPE) \
+     VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<float>)\
+     VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<double>)
+ 
+   #define COMMA ,
+   #define VIENNACL_MAKE_FOR_ALL_SCALARTYPE_LAYOUT(TYPE) \
+     VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<float COMMA viennacl::row_major>)\
+     VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<double COMMA viennacl::row_major>)\
+     VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<float COMMA viennacl::column_major>)\
+     VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<double COMMA viennacl::column_major>)
+ 
+     VIENNACL_MAKE_FOR_ALL_SCALARTYPE_LAYOUT(viennacl::matrix)
+ //    VIENNACL_MAKE_FOR_ALL_SCALARTYPE_LAYOUT(viennacl::matrix_range)
+ //    VIENNACL_MAKE_FOR_ALL_SCALARTYPE_LAYOUT(viennacl::matrix_slice)
+     VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::identity_matrix)
+     VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::zero_matrix)
+     VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::scalar_matrix)
+ 
+   #undef VIENNACL_MAKE_FOR_ALL_SCALARTYPE_LAYOUT
+   #undef VIENNACL_MAKE_FOR_ALL_SCALARTYPE
+   #undef VIENNACL_MAKE_ANY_MATRIX_TRUE
+       /** \endcond */
+ 
+     //
+     // is_row_major
+     //
+     //template <typename T>
+     //struct is_row_major
+     //{
+     //  enum { value = false };
+     //};
+ 
+     /** \cond */
+     template <typename ScalarType>
+     struct is_row_major<viennacl::matrix_base<ScalarType, viennacl::row_major> >
+     {
+       enum { value = true };
+     };
+ 
+     template <>
+     struct is_row_major< viennacl::row_major >
+     {
+       enum { value = true };
+     };
+ 
+     template <typename T>
+     struct is_row_major<viennacl::matrix_expression<T, T, viennacl::op_trans> >
+     {
+       enum { value = is_row_major<T>::value };
+     };
+     /** \endcond */
+ 
+ 
+     //
+     // is_circulant_matrix
+     //
+     //template <typename T>
+     //struct is_circulant_matrix
+     //{
+     //  enum { value = false };
+     //};
+ 
+     /** \cond */
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_circulant_matrix<viennacl::circulant_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+ 
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_circulant_matrix<const viennacl::circulant_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_hankel_matrix
+     //
+     //template <typename T>
+     //struct is_hankel_matrix
+     //{
+     //  enum { value = false };
+     //};
+ 
+     /** \cond */
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_hankel_matrix<viennacl::hankel_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+ 
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_hankel_matrix<const viennacl::hankel_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_toeplitz_matrix
+     //
+     //template <typename T>
+     //struct is_toeplitz_matrix
+     //{
+     //  enum { value = false };
+     //};
+ 
+     /** \cond */
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_toeplitz_matrix<viennacl::toeplitz_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+ 
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_toeplitz_matrix<const viennacl::toeplitz_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_vandermonde_matrix
+     //
+     //template <typename T>
+     //struct is_vandermonde_matrix
+     //{
+     //  enum { value = false };
+     //};
+ 
+     /** \cond */
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_vandermonde_matrix<viennacl::vandermonde_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+ 
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_vandermonde_matrix<const viennacl::vandermonde_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+ 
+     //
+     // is_compressed_matrix
+     //
+ 
+     /** \cond */
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_compressed_matrix<viennacl::compressed_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_coordinate_matrix
+     //
+ 
+     /** \cond */
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_coordinate_matrix<viennacl::coordinate_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_ell_matrix
+     //
+     /** \cond */
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_ell_matrix<viennacl::ell_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_hyb_matrix
+     //
+     /** \cond */
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_hyb_matrix<viennacl::hyb_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+ 
+     //
+     // is_any_sparse_matrix
+     //
+     //template <typename T>
+     //struct is_any_sparse_matrix
+     //{
+     //  enum { value = false };
+     //};
+ 
+     /** \cond */
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_any_sparse_matrix<viennacl::compressed_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+ 
+     template <typename ScalarType>
+     struct is_any_sparse_matrix<viennacl::compressed_compressed_matrix<ScalarType> >
+     {
+       enum { value = true };
+     };
+ 
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_any_sparse_matrix<viennacl::coordinate_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+ 
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_any_sparse_matrix<viennacl::ell_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+ 
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     struct is_any_sparse_matrix<viennacl::hyb_matrix<ScalarType, ALIGNMENT> >
+     {
+       enum { value = true };
+     };
+ 
+     template <typename T>
+     struct is_any_sparse_matrix<const T>
+     {
+       enum { value = is_any_sparse_matrix<T>::value };
+     };
+ 
+     /** \endcond */
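+     // Usage sketch (illustrative): these predicates typically gate overloads
+     // through viennacl::enable_if, along the lines of
+     //
+     //   template <typename M>
+     //   typename viennacl::enable_if<viennacl::is_any_sparse_matrix<M>::value>::type
+     //   frobnicate(M const & mat);   // hypothetical function name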
+ 
+     //////////////// Part 2: Operator predicates ////////////////////
+ 
+     //
+     // is_addition
+     //
+     /** @brief Helper metafunction for checking whether the provided type is viennacl::op_add (for addition) */
+     template <typename T>
+     struct is_addition
+     {
+       enum { value = false };
+     };
+ 
+     /** \cond */
+     template <>
+     struct is_addition<viennacl::op_add>
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_subtraction
+     //
+     /** @brief Helper metafunction for checking whether the provided type is viennacl::op_sub (for subtraction) */
+     template <typename T>
+     struct is_subtraction
+     {
+       enum { value = false };
+     };
+ 
+     /** \cond */
+     template <>
+     struct is_subtraction<viennacl::op_sub>
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_product
+     //
+     /** @brief Helper metafunction for checking whether the provided type is viennacl::op_prod (for products/multiplication) */
+     template <typename T>
+     struct is_product
+     {
+       enum { value = false };
+     };
+ 
+     /** \cond */
+     template <>
+     struct is_product<viennacl::op_prod>
+     {
+       enum { value = true };
+     };
+ 
+     template <>
+     struct is_product<viennacl::op_mult>
+     {
+       enum { value = true };
+     };
+ 
+     template <>
+     struct is_product<viennacl::op_element_binary<op_prod> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_division
+     //
+     /** @brief Helper metafunction for checking whether the provided type is viennacl::op_div (for division) */
+     template <typename T>
+     struct is_division
+     {
+       enum { value = false };
+     };
+ 
+     /** \cond */
+     template <>
+     struct is_division<viennacl::op_div>
+     {
+       enum { value = true };
+     };
+ 
+     template <>
+     struct is_division<viennacl::op_element_binary<op_div> >
+     {
+       enum { value = true };
+     };
+     /** \endcond */
+ 
+     //
+     // is_primitive_type
+     //
+ 
+     /** @brief Helper class for checking whether a type is a primitive type. */
+     template<class T>
+     struct is_primitive_type{ enum {value = false}; };
+ 
+     /** \cond */
+     template<> struct is_primitive_type<float>         { enum { value = true }; };
+     template<> struct is_primitive_type<double>        { enum { value = true }; };
+     template<> struct is_primitive_type<unsigned int>  { enum { value = true }; };
+     template<> struct is_primitive_type<int>           { enum { value = true }; };
+     template<> struct is_primitive_type<unsigned char> { enum { value = true }; };
+     template<> struct is_primitive_type<char>          { enum { value = true }; };
+     template<> struct is_primitive_type<unsigned long> { enum { value = true }; };
+     template<> struct is_primitive_type<long>          { enum { value = true }; };
+     template<> struct is_primitive_type<unsigned short>{ enum { value = true }; };
+     template<> struct is_primitive_type<short>         { enum { value = true }; };
+     /** \endcond */
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+ 
+     /** @brief Helper class for checking whether a particular type is a native OpenCL type. */
+     template<class T>
+     struct is_cl_type{ enum { value = false }; };
+ 
+     /** \cond */
+     template<> struct is_cl_type<cl_float> { enum { value = true }; };
+     template<> struct is_cl_type<cl_double>{ enum { value = true }; };
+     template<> struct is_cl_type<cl_uint>  { enum { value = true }; };
+     template<> struct is_cl_type<cl_int>   { enum { value = true }; };
+     template<> struct is_cl_type<cl_uchar> { enum { value = true }; };
+     template<> struct is_cl_type<cl_char>  { enum { value = true }; };
+     template<> struct is_cl_type<cl_ulong> { enum { value = true }; };
+     template<> struct is_cl_type<cl_long>  { enum { value = true }; };
+     template<> struct is_cl_type<cl_ushort>{ enum { value = true }; };
+     template<> struct is_cl_type<cl_short> { enum { value = true }; };
+     /** \endcond */
+ 
+ #endif
+ 
+ } //namespace viennacl
+ 
+ 
+ #endif
++>>>>>>> upstream/1.5.1
diff --cc viennacl/meta/result_of.hpp
index ba52a8f,579c5db..fc07f68
--- a/viennacl/meta/result_of.hpp
+++ b/viennacl/meta/result_of.hpp
@@@ -1,285 -1,631 +1,919 @@@
++<<<<<<< HEAD
 +#ifndef VIENNACL_META_RESULT_OF_HPP_
 +#define VIENNACL_META_RESULT_OF_HPP_
 +
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file viennacl/meta/result_of.hpp
 +    @brief A collection of compile-time type deductions
 +*/
 +
 +#include <string>
 +#include <fstream>
 +#include <sstream>
 +#include "viennacl/forwards.h"
 +
 +
 +#ifdef VIENNACL_HAVE_UBLAS  
 +#include <boost/numeric/ublas/matrix_sparse.hpp>
 +#include <boost/numeric/ublas/matrix.hpp>
 +#endif
 +
 +#ifdef VIENNACL_HAVE_EIGEN  
 +#include <Eigen/Core>
 +#include <Eigen/Sparse>
 +#endif
 +
 +#ifdef VIENNACL_HAVE_MTL4
 +#include <boost/numeric/mtl/mtl.hpp>
 +#endif
 +
 +#include <vector>
 +#include <map>
 +
 +namespace viennacl
 +{
 +    namespace result_of
 +    {
 +      //
 +      // Retrieve size_type 
 +      //
 +      template <typename T>
 +      struct size_type
 +      {
 +        typedef typename T::size_type   type;
 +      };
 +
 +      #ifdef VIENNACL_HAVE_EIGEN
 +      template <class T, int a, int b, int c, int d, int e>
 +      struct size_type< Eigen::Matrix<T, a, b, c, d, e> >
 +      {
 +        typedef std::size_t   type;
 +      };
 +      
 +      template <>
 +      struct size_type<Eigen::VectorXf>
 +      {
 +        typedef std::size_t   type;
 +      };
 +      
 +      template <>
 +      struct size_type<Eigen::VectorXd>
 +      {
 +        typedef std::size_t   type;
 +      };
 +
 +      template <typename T, int options>
 +      struct size_type<Eigen::SparseMatrix<T, options> >
 +      {
 +        typedef std::size_t   type;
 +      };
 +      #endif
 +      
 +      //
 +      // Retrieve value_type:
 +      //
 +      template <typename T>
 +      struct value_type
 +      {
 +        typedef typename T::value_type    type; 
 +      };
 +
 +      //
 +      // Retrieve cpu value_type:
 +      //
 +      template <typename T>
 +      struct cpu_value_type
 +      {
 +        typedef typename T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T    type; 
 +      };
 +
 +      template <>
 +      struct cpu_value_type<float>
 +      {
 +        typedef float    type; 
 +      };
 +      
 +      template <>
 +      struct cpu_value_type<double>
 +      {
 +        typedef double    type; 
 +      };
 +      
 +      template <typename T>
 +      struct cpu_value_type<viennacl::scalar<T> >
 +      {
 +        typedef T    type; 
 +      };
 +
 +      template <typename T, unsigned int ALIGNMENT>
 +      struct cpu_value_type<viennacl::vector<T, ALIGNMENT> >
 +      {
 +        typedef T    type; 
 +      };
 +
 +      template <typename T>
 +      struct cpu_value_type<viennacl::vector_range<T> >
 +      {
 +        typedef typename cpu_value_type<T>::type    type; 
 +      };
 +
 +      template <typename T>
 +      struct cpu_value_type<viennacl::vector_slice<T> >
 +      {
 +        typedef typename cpu_value_type<T>::type    type; 
 +      };
 +      
 +      template <typename T1, typename T2, typename OP>
 +      struct cpu_value_type<viennacl::vector_expression<T1, T2, OP> >
 +      {
 +        typedef typename cpu_value_type<T1>::type    type; 
 +      };
 +      
 +      
 +      
 +      template <typename T, typename F, unsigned int ALIGNMENT>
 +      struct cpu_value_type<viennacl::matrix<T, F, ALIGNMENT> >
 +      {
 +        typedef T    type; 
 +      };
 +      
 +      template <typename T>
 +      struct cpu_value_type<viennacl::matrix_range<T> >
 +      {
 +        typedef typename cpu_value_type<T>::type    type; 
 +      };
 +
 +      template <typename T>
 +      struct cpu_value_type<viennacl::matrix_slice<T> >
 +      {
 +        typedef typename cpu_value_type<T>::type    type; 
 +      };
 +      
 +      template <typename T1, typename T2, typename OP>
 +      struct cpu_value_type<viennacl::matrix_expression<T1, T2, OP> >
 +      {
 +        typedef typename cpu_value_type<T1>::type    type; 
 +      };
 +      
 +      
 +    #ifdef VIENNACL_HAVE_EIGEN  
 +      template <>
 +      struct value_type<Eigen::MatrixXf>
 +      {
 +        typedef Eigen::MatrixXf::RealScalar    type; 
 +      };
 +      
 +      template <>
 +      struct value_type<Eigen::MatrixXd>
 +      {
 +        typedef Eigen::MatrixXd::RealScalar    type; 
 +      };
 +
 +      template <typename ScalarType, int option>
 +      struct value_type<Eigen::SparseMatrix<ScalarType, option> >
 +      {
 +        typedef ScalarType    type; 
 +      };
 +
 +      template <>
 +      struct value_type<Eigen::VectorXf>
 +      {
 +        typedef Eigen::VectorXf::RealScalar    type; 
 +      };
 +
 +      template <>
 +      struct value_type<Eigen::VectorXd>
 +      {
 +        typedef Eigen::VectorXd::RealScalar    type; 
 +      };
 +      
 +    #endif
 +      
 +      
 +      
 +      template <typename T>
 +      struct matrix_expression_internal_storage
 +      {
 +        typedef T &     type;
 +      };
 +     
 +      template <>
 +      struct matrix_expression_internal_storage<const float>
 +      {
 +        typedef float type;
 +      };
 +      
 +      template <>
 +      struct matrix_expression_internal_storage<const double>
 +      {
 +        typedef double type;
 +      };
 +      
 +      
 +      
 +      
 +      //
 +      // Deduce compatible vector type for a matrix type
 +      //
 +
 +      template <typename T>
 +      struct vector_for_matrix
 +      {
 +        typedef typename T::ERROR_CANNOT_DEDUCE_VECTOR_FOR_MATRIX_TYPE   type;
 +      };
 +
 +      //ViennaCL
 +      template <typename T, typename F, unsigned int A>
 +      struct vector_for_matrix< viennacl::matrix<T, F, A> >
 +      {
 +        typedef viennacl::vector<T,A>   type;
 +      };
 +
 +      template <typename T, unsigned int A>
 +      struct vector_for_matrix< viennacl::compressed_matrix<T, A> >
 +      {
 +        typedef viennacl::vector<T,A>   type;
 +      };
 +
 +      template <typename T, unsigned int A>
 +      struct vector_for_matrix< viennacl::coordinate_matrix<T, A> >
 +      {
 +        typedef viennacl::vector<T,A>   type;
 +      };
 +
 +      #ifdef VIENNACL_HAVE_UBLAS
 +      //Boost:
 +      template <typename T, typename F, typename A>
 +      struct vector_for_matrix< boost::numeric::ublas::matrix<T, F, A> >
 +      {
 +        typedef boost::numeric::ublas::vector<T>   type;
 +      };
 +
 +      template <typename T, typename U, std::size_t A, typename B, typename C>
 +      struct vector_for_matrix< boost::numeric::ublas::compressed_matrix<T, U, A, B, C> >
 +      {
 +        typedef boost::numeric::ublas::vector<T>   type;
 +      };
 +
 +      template <typename T, typename U, std::size_t A, typename B, typename C>
 +      struct vector_for_matrix< boost::numeric::ublas::coordinate_matrix<T, U, A, B, C> >
 +      {
 +        typedef boost::numeric::ublas::vector<T>   type;
 +      };
 +      #endif
 +
 +      
 +      
 +    } //namespace result_of
 +} //namespace viennacl
 +    
 +
 +#endif
++=======
+ #ifndef VIENNACL_META_RESULT_OF_HPP_
+ #define VIENNACL_META_RESULT_OF_HPP_
+ 
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file viennacl/meta/result_of.hpp
+     @brief A collection of compile time type deductions
+ */
+ 
+ #include <string>
+ #include <fstream>
+ #include <sstream>
+ #include "viennacl/forwards.h"
+ 
+ 
+ #ifdef VIENNACL_WITH_UBLAS
+ #include <boost/numeric/ublas/matrix_sparse.hpp>
+ #include <boost/numeric/ublas/matrix.hpp>
+ #endif
+ 
+ #ifdef VIENNACL_WITH_EIGEN
+ #include <Eigen/Core>
+ #include <Eigen/Sparse>
+ #endif
+ 
+ #ifdef VIENNACL_WITH_MTL4
+ #include <boost/numeric/mtl/mtl.hpp>
+ #endif
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+ #ifdef __APPLE__
+ #include <OpenCL/cl.h>
+ #else
+ #include "CL/cl.h"
+ #endif
+ #endif
+ 
+ #include <vector>
+ #include <map>
+ 
+ namespace viennacl
+ {
+     namespace result_of
+     {
+       //
+       // Retrieve alignment from vector
+       //
+       /** @brief Retrieves the alignment from a vector. Deprecated - will be replaced by a pure runtime facility in the future. */
+       template <typename T>
+       struct alignment
+       {
+         typedef typename T::ERROR_ARGUMENT_PROVIDED_IS_NOT_A_VECTOR_OR_A_MATRIX   error_type;
+         enum { value = 1 };
+       };
+ 
+       /** \cond */
+       template <typename T>
+       struct alignment<const T>
+       {
+         enum { value = alignment<T>::value };
+       };
+ 
+       template <typename SCALARTYPE, unsigned int ALIGNMENT>
+       struct alignment< vector<SCALARTYPE, ALIGNMENT> >
+       {
+         enum { value = ALIGNMENT };
+       };
+ 
+       template <typename T>
+       struct alignment< vector_range<T> >
+       {
+         enum { value = alignment<T>::value };
+       };
+ 
+       template <typename T>
+       struct alignment< vector_slice<T> >
+       {
+         enum { value = alignment<T>::value };
+       };
+ 
+       // support for a*x with scalar a and vector x
+       template <typename LHS, typename RHS, typename OP>
+       struct alignment< vector_expression<LHS, RHS, OP> >
+       {
+         enum { value = alignment<LHS>::value };
+       };
+ 
+ 
+       // Matrices
+       template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+       struct alignment< matrix<SCALARTYPE, F, ALIGNMENT> >
+       {
+         enum { value = ALIGNMENT };
+       };
+ 
+       template <typename T>
+       struct alignment< matrix_range<T> >
+       {
+         enum { value = alignment<T>::value };
+       };
+ 
+       template <typename T>
+       struct alignment< matrix_slice<T> >
+       {
+         enum { value = alignment<T>::value };
+       };
+ 
+       template <typename LHS, typename RHS>
+       struct alignment< matrix_expression<LHS, RHS, op_trans> >
+       {
+         enum { value = alignment<LHS>::value };
+       };
+       /** \endcond */
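+ 
+       // Note that proxy types (ranges, slices, expressions) report the
+       // alignment of the type they wrap; only vector<> and matrix<> carry
+       // an ALIGNMENT template argument of their own.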
+ 
+       //
+       // Majority specifier for matrices (row_major, column_major)
+       //
+       /** @brief Returns the orientation functor tag (either row_major or column_major) of a matrix */
+       template <typename T>
+       struct orientation_functor
+       {
+         typedef typename T::ERROR_ARGUMENT_PROVIDED_IS_NOT_A_MATRIX     type;
+       };
+ 
+       /** \cond */
+       template <typename T>
+       struct orientation_functor<const T>
+       {
+         typedef typename orientation_functor<T>::type  type;
+       };
+ 
+       template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+       struct orientation_functor< matrix<SCALARTYPE, F, ALIGNMENT> >
+       {
+         typedef F     type;
+       };
+ 
+       template <typename T>
+       struct orientation_functor< matrix_range<T> >
+       {
+         typedef typename orientation_functor<T>::type  type;
+       };
+ 
+       template <typename T>
+       struct orientation_functor< matrix_slice<T> >
+       {
+         typedef typename orientation_functor<T>::type  type;
+       };
+ 
+       template <typename SCALARTYPE, typename F>
+       struct orientation_functor< matrix_base<SCALARTYPE, F> >
+       {
+         typedef F     type;
+       };
+ 
+       template <typename LHS, typename RHS>
+       struct orientation_functor< matrix_expression<LHS, RHS, op_trans> >
+       {
+         typedef typename orientation_functor<LHS>::type  type;
+       };
+       /** \endcond */
+ 
+ 
+       //
+       // Retrieve size_type
+       //
+       /** @brief Generic meta-function for retrieving the size_type associated with type T */
+       template <typename T>
+       struct size_type
+       {
+         typedef typename T::size_type   type;
+       };
+ 
+       /** \cond */
+       template <typename T, typename SizeType>
+       struct size_type< vector_base<T, SizeType> >
+       {
+         typedef SizeType   type;
+       };
+ 
+       #ifdef VIENNACL_WITH_EIGEN
+       template <class T, int a, int b, int c, int d, int e>
+       struct size_type< Eigen::Matrix<T, a, b, c, d, e> >
+       {
+         typedef vcl_size_t   type;
+       };
+ 
+       template <>
+       struct size_type<Eigen::VectorXf>
+       {
+         typedef vcl_size_t   type;
+       };
+ 
+       template <>
+       struct size_type<Eigen::VectorXd>
+       {
+         typedef vcl_size_t   type;
+       };
+ 
+       template <typename T, int options>
+       struct size_type<Eigen::SparseMatrix<T, options> >
+       {
+         typedef vcl_size_t   type;
+       };
+       #endif
+       /** \endcond */
+ 
+       //
+       // Retrieve value_type:
+       //
+       /** @brief Generic helper function for retrieving the value_type associated with type T */
+       template <typename T>
+       struct value_type
+       {
+         typedef typename T::value_type    type;
+       };
+ 
+       /** \cond */
+ #ifdef VIENNACL_WITH_EIGEN
+       template <>
+       struct value_type<Eigen::MatrixXf>
+       {
+         typedef Eigen::MatrixXf::RealScalar    type;
+       };
+ 
+       template <>
+       struct value_type<Eigen::MatrixXd>
+       {
+         typedef Eigen::MatrixXd::RealScalar    type;
+       };
+ 
+       template <typename ScalarType, int option>
+       struct value_type<Eigen::SparseMatrix<ScalarType, option> >
+       {
+         typedef ScalarType    type;
+       };
+ 
+       template <>
+       struct value_type<Eigen::VectorXf>
+       {
+         typedef Eigen::VectorXf::RealScalar    type;
+       };
+ 
+       template <>
+       struct value_type<Eigen::VectorXd>
+       {
+         typedef Eigen::VectorXd::RealScalar    type;
+       };
+ 
+ #endif
+       /** \endcond */
+ 
+ 
+       //
+       // Retrieve cpu value_type:
+       //
+       /** @brief Helper meta function for retrieving the main RAM-based value type. Particularly important to obtain T from viennacl::scalar<T> in a generic way. */
+       template <typename T>
+       struct cpu_value_type
+       {
+         typedef typename T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T    type;
+       };
+ 
+       /** \cond */
+       template <typename T>
+       struct cpu_value_type<const T>
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <>
+       struct cpu_value_type<char>
+       {
+         typedef char    type;
+       };
+ 
+       template <>
+       struct cpu_value_type<unsigned char>
+       {
+         typedef unsigned char    type;
+       };
+ 
+       template <>
+       struct cpu_value_type<short>
+       {
+         typedef short    type;
+       };
+ 
+       template <>
+       struct cpu_value_type<unsigned short>
+       {
+         typedef unsigned short    type;
+       };
+ 
+       template <>
+       struct cpu_value_type<int>
+       {
+         typedef int    type;
+       };
+ 
+       template <>
+       struct cpu_value_type<unsigned int>
+       {
+         typedef unsigned int    type;
+       };
+ 
+       template <>
+       struct cpu_value_type<long>
+       {
+         typedef long    type;
+       };
+ 
+       template <>
+       struct cpu_value_type<unsigned long>
+       {
+         typedef unsigned long    type;
+       };
+ 
+ 
+       template <>
+       struct cpu_value_type<float>
+       {
+         typedef float    type;
+       };
+ 
+       template <>
+       struct cpu_value_type<double>
+       {
+         typedef double    type;
+       };
+ 
+       template <typename T>
+       struct cpu_value_type<viennacl::scalar<T> >
+       {
+         typedef T    type;
+       };
+ 
+       template <typename T>
+       struct cpu_value_type<viennacl::vector_base<T> >
+       {
+         typedef T    type;
+       };
+ 
+       template <typename T>
+       struct cpu_value_type<viennacl::implicit_vector_base<T> >
+       {
+         typedef T    type;
+       };
+ 
+ 
+       template <typename T, unsigned int ALIGNMENT>
+       struct cpu_value_type<viennacl::vector<T, ALIGNMENT> >
+       {
+         typedef T    type;
+       };
+ 
+       template <typename T>
+       struct cpu_value_type<viennacl::vector_range<T> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T>
+       struct cpu_value_type<viennacl::vector_slice<T> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T1, typename T2, typename OP>
+       struct cpu_value_type<viennacl::vector_expression<const T1, const T2, OP> >
+       {
+         typedef typename cpu_value_type<T1>::type    type;
+       };
+ 
+       template <typename T1, typename T2, typename OP>
+       struct cpu_value_type<const viennacl::vector_expression<const T1, const T2, OP> >
+       {
+         typedef typename cpu_value_type<T1>::type    type;
+       };
+ 
+ 
+       template <typename T, typename F>
+       struct cpu_value_type<viennacl::matrix_base<T, F> >
+       {
+         typedef T    type;
+       };
+ 
+       template <typename T>
+       struct cpu_value_type<viennacl::implicit_matrix_base<T> >
+       {
+         typedef T    type;
+       };
+ 
+ 
+       template <typename T, typename F, unsigned int ALIGNMENT>
+       struct cpu_value_type<viennacl::matrix<T, F, ALIGNMENT> >
+       {
+         typedef T    type;
+       };
+ 
+       template <typename T>
+       struct cpu_value_type<viennacl::matrix_range<T> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T>
+       struct cpu_value_type<viennacl::matrix_slice<T> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T, unsigned int ALIGNMENT>
+       struct cpu_value_type<viennacl::compressed_matrix<T, ALIGNMENT> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T>
+       struct cpu_value_type<viennacl::compressed_compressed_matrix<T> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T, unsigned int ALIGNMENT>
+       struct cpu_value_type<viennacl::coordinate_matrix<T, ALIGNMENT> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T, unsigned int ALIGNMENT>
+       struct cpu_value_type<viennacl::ell_matrix<T, ALIGNMENT> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T, unsigned int ALIGNMENT>
+       struct cpu_value_type<viennacl::hyb_matrix<T, ALIGNMENT> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T, unsigned int ALIGNMENT>
+       struct cpu_value_type<viennacl::circulant_matrix<T, ALIGNMENT> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T, unsigned int ALIGNMENT>
+       struct cpu_value_type<viennacl::hankel_matrix<T, ALIGNMENT> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T, unsigned int ALIGNMENT>
+       struct cpu_value_type<viennacl::toeplitz_matrix<T, ALIGNMENT> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T, unsigned int ALIGNMENT>
+       struct cpu_value_type<viennacl::vandermonde_matrix<T, ALIGNMENT> >
+       {
+         typedef typename cpu_value_type<T>::type    type;
+       };
+ 
+       template <typename T1, typename T2, typename OP>
+       struct cpu_value_type<viennacl::matrix_expression<T1, T2, OP> >
+       {
+         typedef typename cpu_value_type<T1>::type    type;
+       };
+ 
+ 
+       //
+       // Deduce compatible vector type for a matrix type
+       //
+ 
+       template <typename T>
+       struct vector_for_matrix
+       {
+         typedef typename T::ERROR_CANNOT_DEDUCE_VECTOR_FOR_MATRIX_TYPE   type;
+       };
+ 
+       //ViennaCL
+       template <typename T, typename F, unsigned int A>
+       struct vector_for_matrix< viennacl::matrix<T, F, A> >
+       {
+         typedef viennacl::vector<T,A>   type;
+       };
+ 
+       template <typename T, unsigned int A>
+       struct vector_for_matrix< viennacl::compressed_matrix<T, A> >
+       {
+         typedef viennacl::vector<T,A>   type;
+       };
+ 
+       template <typename T, unsigned int A>
+       struct vector_for_matrix< viennacl::coordinate_matrix<T, A> >
+       {
+         typedef viennacl::vector<T,A>   type;
+       };
+ 
+       #ifdef VIENNACL_WITH_UBLAS
+       //Boost:
+       template <typename T, typename F, typename A>
+       struct vector_for_matrix< boost::numeric::ublas::matrix<T, F, A> >
+       {
+         typedef boost::numeric::ublas::vector<T>   type;
+       };
+ 
+       template <typename T, typename U, vcl_size_t A, typename B, typename C>
+       struct vector_for_matrix< boost::numeric::ublas::compressed_matrix<T, U, A, B, C> >
+       {
+         typedef boost::numeric::ublas::vector<T>   type;
+       };
+ 
+       template <typename T, typename U, vcl_size_t A, typename B, typename C>
+       struct vector_for_matrix< boost::numeric::ublas::coordinate_matrix<T, U, A, B, C> >
+       {
+         typedef boost::numeric::ublas::vector<T>   type;
+       };
+       #endif
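+ 
+       // vector_for_matrix<M>::type names a vector type compatible with M in
+       // a matrix-vector product; the deliberately undefined ERROR_... typedef
+       // in the primary template turns unsupported matrix types into readable
+       // compile-time errors.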
+ 
+ 
+       template <typename T>
+       struct reference_if_nonscalar
+       {
+         typedef T &    type;
+       };
+ 
+ #define VIENNACL_REFERENCE_IF_NONSCALAR_INT(TNAME) \
+       template <> struct reference_if_nonscalar<TNAME>                { typedef                TNAME  type; }; \
+       template <> struct reference_if_nonscalar<const TNAME>          { typedef          const TNAME  type; }; \
+       template <> struct reference_if_nonscalar<unsigned TNAME>       { typedef       unsigned TNAME  type; }; \
+       template <> struct reference_if_nonscalar<const unsigned TNAME> { typedef const unsigned TNAME  type; };
+ 
+       VIENNACL_REFERENCE_IF_NONSCALAR_INT(char)
+       VIENNACL_REFERENCE_IF_NONSCALAR_INT(short)
+       VIENNACL_REFERENCE_IF_NONSCALAR_INT(int)
+       VIENNACL_REFERENCE_IF_NONSCALAR_INT(long)
+ 
+ #undef VIENNACL_REFERENCE_IF_NONSCALAR_INT
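+ 
+       // For each integer TNAME, the macro above emits four specializations
+       // (TNAME, const TNAME, unsigned TNAME, const unsigned TNAME), so that
+       // built-in scalars are passed by value rather than by reference.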
+ 
+       template <>
+       struct reference_if_nonscalar<float>
+       {
+         typedef float    type;
+       };
+ 
+       template <>
+       struct reference_if_nonscalar<const float>
+       {
+         typedef const float    type;
+       };
+ 
+       template <>
+       struct reference_if_nonscalar<double>
+       {
+         typedef double    type;
+       };
+ 
+       template <>
+       struct reference_if_nonscalar<const double>
+       {
+         typedef const double    type;
+       };
+ 
+       /** \endcond */
+ 
+       //OpenCL equivalent type
+       /** @brief Metafunction for deducing the OpenCL type for a numeric type, e.g. float -> cl_float */
+       template<typename T>
+       struct cl_type
+       {
+           typedef T type;
+       };
+ 
+       /** \cond */
+ #ifdef VIENNACL_WITH_OPENCL
+       template<>
+       struct cl_type<float>{ typedef cl_float type; };
+ 
+       template<>
+       struct cl_type<double>{ typedef cl_double type; };
+ 
+       template<>
+       struct cl_type<int>{ typedef cl_int type; };
+ 
+       template<>
+       struct cl_type<unsigned int>{  typedef cl_uint type; };
+ 
+       template<>
+       struct cl_type<long>{  typedef cl_long type;  };
+ 
+       template<>
+       struct cl_type<unsigned long>{ typedef cl_ulong type; };
+ 
+       template<>
+       struct cl_type<short>{ typedef cl_short type;  };
+ 
+       template<>
+       struct cl_type<unsigned short>{ typedef cl_ushort type; };
+ 
+       template<>
+       struct cl_type<char>{ typedef cl_char type; };
+ 
+       template<>
+       struct cl_type<unsigned char>{ typedef cl_uchar type; };
+ #endif
+       /** \endcond */
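+ 
+       // When VIENNACL_WITH_OPENCL is not defined, the primary template above
+       // applies and cl_type<T>::type is simply T, so host-only builds compile
+       // unchanged.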
+ 
+     } //namespace result_of
+ } //namespace viennacl
+ 
+ 
+ #endif
++>>>>>>> upstream/1.5.1
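
As a usage note for the meta-functions in this header: a minimal sketch, assuming ViennaCL 1.5.x headers on the include path (illustrative, not part of the patch):

    #include <viennacl/vector.hpp>
    #include <viennacl/meta/result_of.hpp>

    int main()
    {
      // cpu_value_type peels viennacl::vector<float> back to plain float:
      typedef viennacl::result_of::cpu_value_type< viennacl::vector<float> >::type  host_scalar;
      host_scalar s = 1.0f;  // deduced as float
      return (s == 1.0f) ? 0 : 1;
    }
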
diff --cc viennacl/meta/tag_of.hpp
index edeeaf4,8329e61..3ef40ca
--- a/viennacl/meta/tag_of.hpp
+++ b/viennacl/meta/tag_of.hpp
@@@ -239,7 -252,7 +252,11 @@@ namespace viennac
      {
        typedef viennacl::tag_viennacl  type;
      };
++<<<<<<< HEAD
 +    
++=======
+ 
++>>>>>>> upstream/1.5.1
      template< typename T, unsigned int I>
      struct tag_of< viennacl::circulant_matrix<T,I> >
      {
diff --cc viennacl/ocl/backend.hpp
index a5e25f8,b171e76..7157de1
--- a/viennacl/ocl/backend.hpp
+++ b/viennacl/ocl/backend.hpp
@@@ -153,12 -160,18 +160,27 @@@ namespace viennac
            contexts_[i].default_device_type(t);
          }
  
++<<<<<<< HEAD
 +        /** @brief Sets the platform index for the context */
 +        static void set_context_platform_index(long i, std::size_t pf_index)
 +        {
 +          contexts_[i].platform_index(pf_index);
 +        }
 +        
++=======
+         /** @brief Sets the maximum number of devices per context. Ignored if a device array is provided as well.  */
+         static void set_context_device_num(long i, vcl_size_t num)
+         {
+           contexts_[i].default_device_num(num);
+         }
+ 
+         /** @brief Sets the platform index for the context */
+         static void set_context_platform_index(long i, vcl_size_t pf_index)
+         {
+           contexts_[i].platform_index(pf_index);
+         }
+ 
++>>>>>>> upstream/1.5.1
        private:
          static long current_context_id_;
          static std::map<long, bool> initialized_;
@@@ -250,17 -277,23 +286,37 @@@
        set_context_device_type(i, CL_DEVICE_TYPE_ACCELERATOR);
      }
  
++<<<<<<< HEAD
 +    
 +    /** @brief Convenience function for setting the platform index
 +     * 
 +     * @param i         Context ID
 +     * @param pf_index  The platform index as returned by clGetPlatformIDs(). This is not the ID of type cl_platform_id!
 +     */
 +    inline void set_context_platform_index(long i, std::size_t pf_index)
 +    {
 +      viennacl::ocl::backend<>::set_context_platform_index(i, pf_index);
 +    }
 +    
++=======
+     /** @brief Convenience function for setting the number of default devices per context */
+     inline void set_context_device_num(long i, vcl_size_t num)
+     {
+       viennacl::ocl::backend<>::set_context_device_num(i, num);
+     }
+ 
+ 
+     /** @brief Convenience function for setting the platform index
+      *
+      * @param i         Context ID
+      * @param pf_index  The platform index as returned by clGetPlatformIDs(). This is not the ID of type cl_platform_id!
+      */
+     inline void set_context_platform_index(long i, vcl_size_t pf_index)
+     {
+       viennacl::ocl::backend<>::set_context_platform_index(i, pf_index);
+     }
+ 
++>>>>>>> upstream/1.5.1
      ///////////////////////// get queues ///////////////////
      /** @brief Convenience function for getting the default queue for the currently active device in the active context */
      inline viennacl::ocl::command_queue & get_queue()
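
The context setters above are meant to be called once, before the context is first used. A minimal sketch, assuming an OpenCL-enabled build (VIENNACL_WITH_OPENCL) and at least two installed platforms:

    #include <viennacl/ocl/backend.hpp>

    int main()
    {
      // The second argument is the index as returned by clGetPlatformIDs(),
      // not a cl_platform_id; then cap context 0 at a single device.
      viennacl::ocl::set_context_platform_index(0, 1);
      viennacl::ocl::set_context_device_num(0, 1);
      return 0;
    }
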
diff --cc viennacl/ocl/context.hpp
index 25b49de,c782adc..67fc621
--- a/viennacl/ocl/context.hpp
+++ b/viennacl/ocl/context.hpp
@@@ -48,17 -55,18 +55,25 @@@ namespace viennac
        public:
          context() : initialized_(false),
                      device_type_(CL_DEVICE_TYPE_DEFAULT),
++<<<<<<< HEAD
 +                    current_device_id(0),
 +                    default_device_num_(1),
 +                    pf_index_(0) {}
 +        
++=======
+                     current_device_id_(0),
+                     default_device_num_(1),
+                     pf_index_(0),
+                     current_queue_id_(0) {}
+ 
++>>>>>>> upstream/1.5.1
          //////// Get and set default number of devices per context
          /** @brief Returns the maximum number of devices to be set up for the context */
-         std::size_t default_device_num() const { return default_device_num_; }
-         
+         vcl_size_t default_device_num() const { return default_device_num_; }
+ 
          /** @brief Sets the maximum number of devices to be set up for the context */
-         void default_device_num(std::size_t new_num) { default_device_num_ = new_num; }
-         
+         void default_device_num(vcl_size_t new_num) { default_device_num_ = new_num; }
+ 
          ////////// get and set preferred device type /////////////////////
          /** @brief Returns the default device type for the context */
          cl_device_type default_device_type()
@@@ -263,13 -354,17 +361,24 @@@
            #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
            std::cout << "ViennaCL: Adding program '" << prog_name << "' to context " << h_ << std::endl;
            #endif
-           
-           viennacl::ocl::handle<cl_program> temp = clCreateProgramWithSource(h_.get(), 1, (const char **)&source_text, &source_size, &err);
+ 
+           //
+           // Build program
+           //
+           cl_program temp = clCreateProgramWithSource(h_.get(), 1, (const char **)&source_text, &source_size, &err);
            VIENNACL_ERR_CHECK(err);
++<<<<<<< HEAD
 +          
 +          const char * options = build_options_.c_str();
 +          err = clBuildProgram(temp.get(), 0, NULL, options, NULL, NULL);
 +          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_BUILD)
++=======
+ 
+           const char * options = build_options_.c_str();
+           err = clBuildProgram(temp, 0, NULL, options, NULL, NULL);
+           if (err != CL_SUCCESS)
+           {
++>>>>>>> upstream/1.5.1
              char buffer[8192];
              cl_build_status status;
              clGetProgramBuildInfo(temp, devices_[0].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
@@@ -299,42 -424,73 +438,92 @@@
                return *it;
            }
            std::cerr << "Could not find program '" << name << "'" << std::endl;
-           assert(!"In class 'context': name invalid in get_program()");
-           return programs_[0];  //return a defined object
+           throw "In class 'context': name invalid in get_program()";
+           //return programs_[0];  //return a defined object
          }
-         
+ 
+         viennacl::ocl::program const & get_program(std::string const & name) const
+         {
+           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
+           std::cout << "ViennaCL: Getting program '" << name << "' from context " << h_ << std::endl;
+           #endif
+           for (ProgramContainer::const_iterator it = programs_.begin();
+                 it != programs_.end();
+                 ++it)
+           {
+             if (it->name() == name)
+               return *it;
+           }
+           std::cerr << "Could not find program '" << name << "'" << std::endl;
+           throw "In class 'context': name invalid in get_program()";
+           //return programs_[0];  //return a defined object
+         }
+ 
+         /** @brief Returns whether the program with the provided name exists or not */
+         bool has_program(std::string const & name)
+         {
+             for (ProgramContainer::iterator it = programs_.begin();
+                   it != programs_.end();
+                   ++it)
+             {
+               if (it->name() == name) return true;
+             }
+             return false;
+         }
+ 
          /** @brief Returns the program with the provided id */
-         viennacl::ocl::program & get_program(size_t id)
+         viennacl::ocl::program & get_program(vcl_size_t id)
          {
-           assert(id >= 0 && id < programs_.size() && "In class 'context': id invalid in get_program()");
+           assert(id < programs_.size() && bool("In class 'context': id invalid in get_program()"));
            return programs_[id];
          }
-         
+ 
          /** @brief Returns the number of programs within this context */
-         size_t program_num() { return programs_.size(); }
+         vcl_size_t program_num() { return programs_.size(); }
+ 
+         /** @brief Convenience function for retrieving the kernel of a program directly from the context */
+         viennacl::ocl::kernel & get_kernel(std::string const & program_name, std::string const & kernel_name) { return get_program(program_name).get_kernel(kernel_name); }
  
          /** @brief Returns the number of devices within this context */
-         size_t device_num() { return devices_.size(); }
-         
+         vcl_size_t device_num() { return devices_.size(); }
+ 
          /** @brief Returns the context handle */
          const viennacl::ocl::handle<cl_context> & handle() const { return h_; }
++<<<<<<< HEAD
 +        
 +        /** @brief Returns the current build option string */
 +        std::string build_options() const { return build_options_; }
 +        
++=======
+ 
+         /** @brief Returns the current build option string */
+         std::string build_options() const { return build_options_; }
+ 
++>>>>>>> upstream/1.5.1
          /** @brief Sets the build option string, which is passed to the OpenCL compiler in subsequent compilations. Does not affect programs compiled previously. */
          void build_options(std::string op) { build_options_ = op; }
  
          /** @brief Returns the platform ID of the platform to be used for the context */
++<<<<<<< HEAD
 +        std::size_t platform_index() const  { return pf_index_; }
 +
 +        /** @brief Sets the platform ID of the platform to be used for the context */
 +        void platform_index(std::size_t new_index)
 +        {
 +          assert(!initialized_ && "Platform ID must be set before context is initialized!");
 +          pf_index_ = new_index; 
 +        }
 +        
++=======
+         vcl_size_t platform_index() const  { return pf_index_; }
+ 
+         /** @brief Sets the platform ID of the platform to be used for the context */
+         void platform_index(vcl_size_t new_index)
+         {
+           assert(!initialized_ && bool("Platform ID must be set before context is initialized!"));
+           pf_index_ = new_index;
+         }
+ 
++>>>>>>> upstream/1.5.1
          /** @brief Less-than comparison for compatibility with std::map */
          bool operator<(context const & other) const
          {
@@@ -359,7 -520,7 +553,11 @@@
              #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
              std::cout << "ViennaCL: Setting all devices for context..." << std::endl;
              #endif
++<<<<<<< HEAD
 +            
++=======
+ 
++>>>>>>> upstream/1.5.1
              platform pf(pf_index_);
              std::vector<device> devices = pf.devices(device_type_);
              #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
@@@ -457,9 -619,66 +656,70 @@@
          ProgramContainer programs_;
          std::map< cl_device_id, std::vector< viennacl::ocl::command_queue> > queues_;
          std::string build_options_;
++<<<<<<< HEAD
 +        std::size_t pf_index_;
++=======
+         vcl_size_t pf_index_;
+         vcl_size_t current_queue_id_;
++>>>>>>> upstream/1.5.1
      }; //context
-     
+ 
+ 
+ 
+     /** @brief Adds a kernel to the program */
+     inline viennacl::ocl::kernel & viennacl::ocl::program::add_kernel(cl_kernel kernel_handle, std::string const & kernel_name)
+     {
+       assert(p_context_ != NULL && bool("Pointer to context invalid in viennacl::ocl::program object"));
+       viennacl::ocl::kernel temp(kernel_handle, *this, *p_context_, kernel_name);
+       kernels_.push_back(temp);
+       return kernels_.back();
+     }
+ 
+     /** @brief Returns the kernel with the provided name */
+     inline viennacl::ocl::kernel & viennacl::ocl::program::get_kernel(std::string const & name)
+     {
+       //std::cout << "Requiring kernel " << name << " from program " << name_ << std::endl;
+       for (KernelContainer::iterator it = kernels_.begin();
+             it != kernels_.end();
+            ++it)
+       {
+         if (it->name() == name)
+           return *it;
+       }
+       std::cerr << "ViennaCL: FATAL ERROR: Could not find kernel '" << name << "' from program '" << name_ << "'" << std::endl;
+       std::cout << "Number of kernels in program: " << kernels_.size() << std::endl;
+       throw "Kernel not found";
+       //return kernels_[0];  //return a defined object
+     }
+ 
+ 
+     inline void viennacl::ocl::kernel::set_work_size_defaults()
+     {
+       assert( p_program_ != NULL && bool("Kernel not initialized, program pointer invalid."));
+       assert( p_context_ != NULL && bool("Kernel not initialized, context pointer invalid."));
+ 
+       if (   (p_context_->current_device().type() == CL_DEVICE_TYPE_GPU)
+           || (p_context_->current_device().type() == CL_DEVICE_TYPE_ACCELERATOR) // Xeon Phi
+          )
+       {
+         local_work_size_[0] = 128;      local_work_size_[1] = 0;  local_work_size_[2] = 0;
+         global_work_size_[0] = 128*128; global_work_size_[1] = 0; global_work_size_[2] = 0;
+       }
+       else //assume CPU type:
+       {
+         //conservative assumption: one thread per CPU core:
+         local_work_size_[0] = 1; local_work_size_[1] = 0; local_work_size_[2] = 0;
+ 
+         size_type units = p_context_->current_device().max_compute_units();
+         size_type s = 1;
+ 
+         while (s < units) // find next power of 2. Important to make reductions work on e.g. six-core CPUs.
+           s *= 2;
+ 
+         global_work_size_[0] = s; global_work_size_[1] = 0; global_work_size_[2] = 0;
+       }
+     }
+ 
    }
  }
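
The CPU branch of set_work_size_defaults() above rounds the compute-unit count up to the next power of two so that tree reductions also work on, say, six-core machines. The same rounding as a standalone sketch (helper name hypothetical):

    #include <cassert>
    #include <cstddef>

    // Mirrors the while-loop in set_work_size_defaults():
    static std::size_t next_power_of_two(std::size_t units)
    {
      std::size_t s = 1;
      while (s < units)
        s *= 2;
      return s;
    }

    int main()
    {
      assert(next_power_of_two(6) == 8);  // six cores -> global size 8
      assert(next_power_of_two(8) == 8);  // powers of two are kept as-is
      return 0;
    }
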
  
diff --cc viennacl/ocl/enqueue.hpp
index 422289c,f2af576..27cdbf8
--- a/viennacl/ocl/enqueue.hpp
+++ b/viennacl/ocl/enqueue.hpp
@@@ -1,157 -1,129 +1,289 @@@
++<<<<<<< HEAD
 +#ifndef VIENNACL_OCL_ENQUEUE_HPP_
 +#define VIENNACL_OCL_ENQUEUE_HPP_
 +
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file enqueue.hpp
 +    @brief Enqueues kernels into command queues
 +*/
 +
 +#ifdef __APPLE__
 +#include <OpenCL/cl.h>
 +#else
 +#include <CL/cl.h>
 +#endif
 +
 +#include "viennacl/ocl/kernel.hpp"
 +#include "viennacl/ocl/command_queue.hpp"
 +
 +namespace viennacl
 +{
 +  namespace generator{
 +      class custom_operation;
 +      void enqueue_custom_op(viennacl::generator::custom_operation & op, viennacl::ocl::command_queue const & queue);
 +  }
 +  
 +  namespace ocl
 +  {
 +
 +    /** @brief Enqueues a kernel in the provided queue */
 +    template <typename KernelType>
 +    void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue)
 +    {
 +      // 1D kernel:
 +      if (k.local_work_size(1) == 0)
 +      {
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Starting 1D-kernel '" << k.name() << "'..." << std::endl;
 +        std::cout << "ViennaCL: Global work size: '"  << k.global_work_size() << "'..." << std::endl;
 +        std::cout << "ViennaCL: Local work size: '"   << k.local_work_size() << "'..." << std::endl;
 +        #endif
 +      
 +        size_t tmp_global = k.global_work_size();
 +        size_t tmp_local = k.local_work_size();
 +        
 +        cl_int err;
 +        if (tmp_global == 1 && tmp_local == 1)
 +          err = clEnqueueTask(queue.handle().get(), k.handle().get(), 0, NULL, NULL);
 +        else
 +          err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
 +
 +        if (err != CL_SUCCESS)  //if not successful, try to start with smaller work size
 +        {
 +          //std::cout << "FAIL: " << std::endl; exit(0);
 +          while (err != CL_SUCCESS && tmp_local > 1)
 +          {
 +            //std::cout << "Flushing queue, then enqueuing again with half the size..." << std::endl;
 +            //std::cout << "Error code: " << err << std::endl;
 +            
 +            tmp_global /= 2;
 +            tmp_local /= 2;
 +
 +            #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +            std::cout << "ViennaCL: Kernel start failed for '" << k.name() << "'." << std::endl;
 +            std::cout << "ViennaCL: Global work size: '"  << tmp_global << "'..." << std::endl;
 +            std::cout << "ViennaCL: Local work size: '"   << tmp_local << "'..." << std::endl;
 +            #endif
 +            
 +            queue.finish();
 +            err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
 +          }
 +          
 +          if (err != CL_SUCCESS)
 +          {
 +            //could not start kernel with any parameters
 +            std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl;
 +            std::cerr << "ViennaCL: Smaller work sizes could not solve the problem. " << std::endl;
 +            VIENNACL_ERR_CHECK(err);
 +          }
 +          else
 +          {
 +            //remember parameters:
 +            k.local_work_size(0, tmp_local);
 +            k.global_work_size(0, tmp_global);
 +            #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +            std::cout << "ViennaCL: Kernel '" << k.name() << "' now uses global work size " << tmp_global << " and local work size " << tmp_local << "."  << std::endl;
 +            #endif
 +          }          
 +        }
 +      }
 +      else //2D kernel
 +      {
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Starting 2D-kernel '" << k.name() << "'..." << std::endl;
 +        std::cout << "ViennaCL: Global work size: '"  << k.global_work_size(0) << ", " << k.global_work_size(1) << "'..." << std::endl;
 +        std::cout << "ViennaCL: Local work size: '"   << k.local_work_size(0) << ", " << k.local_work_size(1) << "'..." << std::endl;
 +        #endif
 +
 +        size_t tmp_global[2]; 
 +        tmp_global[0] = k.global_work_size(0);
 +        tmp_global[1] = k.global_work_size(1);
 +        
 +        size_t tmp_local[2];
 +        tmp_local[0] = k.local_work_size(0);
 +        tmp_local[1] = k.local_work_size(1);
 +        
 +        cl_int err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 2, NULL, tmp_global, tmp_local, 0, NULL, NULL);
 +
 +        if (err != CL_SUCCESS)
 +        {
 +          //could not start kernel with any parameters
 +          std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl;
 +          VIENNACL_ERR_CHECK(err);
 +        }
 +        
 +      }
 +            
 +      #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +      queue.finish();
 +      std::cout << "ViennaCL: Kernel " << k.name() << " finished!" << std::endl;
 +      #endif
 +    } //enqueue()
 +    
 +    
 +    /** @brief Convenience function that enqueues the provided kernel into the first queue of the currently active device in the currently active context */
 +    template <typename KernelType>
 +    void enqueue(KernelType & k)
 +    {
 +      enqueue(k, viennacl::ocl::current_context().get_queue());
 +    }
 +    
 +    inline void enqueue(viennacl::generator::custom_operation & op, viennacl::ocl::command_queue const & queue)
 +    {
 +      generator::enqueue_custom_op(op,queue);
 +    }
 +
 +    inline void enqueue(viennacl::generator::custom_operation & op)
 +    {
 +      enqueue(op, viennacl::ocl::current_context().get_queue());
 +    }
 +    
 +  } // namespace ocl
 +} // namespace viennacl
 +#endif
++=======
+ #ifndef VIENNACL_OCL_ENQUEUE_HPP_
+ #define VIENNACL_OCL_ENQUEUE_HPP_
+ 
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file viennacl/ocl/enqueue.hpp
+     @brief Enqueues kernels into command queues
+ */
+ 
+ #ifdef __APPLE__
+ #include <OpenCL/cl.h>
+ #else
+ #include <CL/cl.h>
+ #endif
+ 
+ #include "viennacl/ocl/backend.hpp"
+ #include "viennacl/ocl/kernel.hpp"
+ #include "viennacl/ocl/command_queue.hpp"
+ #include "viennacl/ocl/context.hpp"
+ 
+ namespace viennacl
+ {
+   namespace generator{
+       class custom_operation;
+       void enqueue_custom_op(viennacl::generator::custom_operation & op, viennacl::ocl::command_queue const & queue);
+   }
+ 
+   namespace ocl
+   {
+ 
+     /** @brief Enqueues a kernel in the provided queue */
+     template <typename KernelType>
+     void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue)
+     {
+       // 1D kernel:
+       if (k.local_work_size(1) == 0)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Starting 1D-kernel '" << k.name() << "'..." << std::endl;
+         std::cout << "ViennaCL: Global work size: '"  << k.global_work_size() << "'..." << std::endl;
+         std::cout << "ViennaCL: Local work size: '"   << k.local_work_size() << "'..." << std::endl;
+         #endif
+ 
+         vcl_size_t tmp_global = k.global_work_size();
+         vcl_size_t tmp_local = k.local_work_size();
+ 
+         cl_int err;
+         if (tmp_global == 1 && tmp_local == 1)
+           err = clEnqueueTask(queue.handle().get(), k.handle().get(), 0, NULL, NULL);
+         else
+           err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
+ 
+         if (err != CL_SUCCESS)
+         {
+           std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl;
+           std::cerr << "ViennaCL: Smaller work sizes could not solve the problem. " << std::endl;
+           VIENNACL_ERR_CHECK(err);
+         }
+       }
+       else //2D or 3D kernel
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Starting 2D/3D-kernel '" << k.name() << "'..." << std::endl;
+         std::cout << "ViennaCL: Global work size: '"  << k.global_work_size(0) << ", " << k.global_work_size(1) << ", " << k.global_work_size(2) << "'..." << std::endl;
+         std::cout << "ViennaCL: Local work size: '"   << k.local_work_size(0) << ", " << k.local_work_size(1) << ", " << k.local_work_size(2) << "'..." << std::endl;
+         #endif
+ 
+         vcl_size_t tmp_global[3];
+         tmp_global[0] = k.global_work_size(0);
+         tmp_global[1] = k.global_work_size(1);
+         tmp_global[2] = k.global_work_size(2);
+ 
+         vcl_size_t tmp_local[3];
+         tmp_local[0] = k.local_work_size(0);
+         tmp_local[1] = k.local_work_size(1);
+         tmp_local[2] = k.local_work_size(2);
+ 
+         cl_int err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), (tmp_global[2] == 0) ? 2 : 3, NULL, tmp_global, tmp_local, 0, NULL, NULL);
+ 
+         if (err != CL_SUCCESS)
+         {
+           //could not start kernel with any parameters
+           std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl;
+           VIENNACL_ERR_CHECK(err);
+         }
+       }
+ 
+       #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+       queue.finish();
+       std::cout << "ViennaCL: Kernel " << k.name() << " finished!" << std::endl;
+       #endif
+     } //enqueue()
+ 
+ 
+     /** @brief Convenience function that enqueues the provided kernel into the first queue of the currently active device in the currently active context */
+     template <typename KernelType>
+     void enqueue(KernelType & k)
+     {
+       enqueue(k, k.context().get_queue());
+     }
+ 
+     inline void enqueue(viennacl::generator::custom_operation & op, viennacl::ocl::command_queue const & queue)
+     {
+       generator::enqueue_custom_op(op,queue);
+     }
+ 
+     inline void enqueue(viennacl::generator::custom_operation & op)
+     {
+       enqueue(op, viennacl::ocl::current_context().get_queue());
+     }
+ 
+   } // namespace ocl
+ } // namespace viennacl
+ #endif
++>>>>>>> upstream/1.5.1
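
Typical call pattern for the enqueue() overloads above, as a sketch; the program and kernel names are placeholders and assume a program was previously registered with the active context via add_program():

    #include <viennacl/ocl/backend.hpp>
    #include <viennacl/ocl/enqueue.hpp>

    void run_my_kernel()
    {
      viennacl::ocl::kernel & k =
          viennacl::ocl::current_context().get_kernel("my_program", "my_kernel");

      k.global_work_size(0, 128 * 128);
      k.local_work_size(0, 128);

      viennacl::ocl::enqueue(k);  // 1D launch, since local_work_size(1) stays 0
    }
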
diff --cc viennacl/ocl/kernel.hpp
index 5085158,5b98b97..4ed012a
--- a/viennacl/ocl/kernel.hpp
+++ b/viennacl/ocl/kernel.hpp
@@@ -1,751 -1,836 +1,1590 @@@
++<<<<<<< HEAD
 +#ifndef VIENNACL_OCL_KERNEL_HPP_
 +#define VIENNACL_OCL_KERNEL_HPP_
 +
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file kernel.hpp
 +    @brief Representation of an OpenCL kernel in ViennaCL.
 +*/
 +
 +#ifdef __APPLE__
 +#include <OpenCL/cl.h>
 +#else
 +#include <CL/cl.h>
 +#endif
 +
 +#include "viennacl/ocl/forwards.h"
 +#include "viennacl/ocl/backend.hpp"
 +#include "viennacl/ocl/handle.hpp"
 +#include "viennacl/ocl/program.hpp"
 +#include "viennacl/ocl/device.hpp"
 +#include "viennacl/ocl/local_mem.hpp"
 +
 +namespace viennacl
 +{
 +  namespace ocl
 +  {
 +    
 +    /** @brief Represents an OpenCL kernel within ViennaCL */
 +    class kernel
 +    {
 +      template <typename KernelType>
 +      friend void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue);
 +      
 +      
 +    public:
 +      typedef std::size_t            size_type;
 +      
 +      kernel() : handle_(0)
 +      {
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Creating kernel object (default CTOR)" << std::endl;
 +        #endif
 +        set_work_size_defaults();
 +      }
 +      
 +      kernel(viennacl::ocl::handle<cl_program> const & prog, std::string const & name) 
 +       : handle_(0), program_(prog), name_(name), init_done_(false)
 +      {
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Creating kernel object (full CTOR)" << std::endl;
 +        #endif
 +        set_work_size_defaults();
 +      }
 +      
 +      kernel(kernel const & other) 
 +       : handle_(other.handle_), program_(other.program_), name_(other.name_), init_done_(other.init_done_)
 +      {
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Creating kernel object (Copy CTOR)" << std::endl;
 +        #endif
 +        local_work_size_[0] = other.local_work_size_[0];
 +        local_work_size_[1] = other.local_work_size_[1];
 +        
 +        global_work_size_[0] = other.global_work_size_[0];
 +        global_work_size_[1] = other.global_work_size_[1];
 +      }
 +      
 +      viennacl::ocl::kernel & operator=(const kernel & other)
 +      {
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Assigning kernel object" << std::endl;
 +        #endif
 +        handle_ = other.handle_;
 +        program_ = other.program_;
 +        name_ = other.name_;
 +        init_done_ = other.init_done_;
 +        local_work_size_[0] = other.local_work_size_[0];
 +        local_work_size_[1] = other.local_work_size_[1];
 +        global_work_size_[0] = other.global_work_size_[0];
 +        global_work_size_[1] = other.global_work_size_[1];
 +        return *this;
 +      }
 +      
 +      
 +      /** @brief Sets an unsigned integer argument at the provided position */
 +      void arg(unsigned int pos, cl_uint val)
 +      {
 +        init();
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Setting unsigned long kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
 +        #endif
 +        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_uint), (void*)&val);
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +
 +      /** @brief Sets a single precision floating point argument at the provided position */
 +      void arg(unsigned int pos, float val)
 +      {
 +        init();
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Setting floating point kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
 +        #endif
 +        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(float), (void*)&val);
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +
 +      /** @brief Sets a double precision floating point argument at the provided position */
 +      void arg(unsigned int pos, double val)
 +      {
 +        init();
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Setting double precision kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
 +        #endif
 +        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(double), (void*)&val);
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +
 +      //generic handling: call .handle() member
 +      /** @brief Sets an OpenCL memory object at the provided position */
 +      template<class VCL_TYPE>
 +      void arg(unsigned int pos, VCL_TYPE const & val)
 +      {
 +        init();
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Setting generic kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
 +        #endif
 +        cl_mem temp = val.handle().get();
 +        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_mem), (void*)&temp);
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +      
 +      //forward handles directly:
 +      /** @brief Sets an OpenCL object at the provided position */
 +      template<class CL_TYPE>
 +      void arg(unsigned int pos, viennacl::ocl::handle<CL_TYPE> const & h)
 +      {
 +        //arg(pos, h);
 +        init();
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Setting handle kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
 +        #endif
 +        CL_TYPE temp = h.get();
 +        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(CL_TYPE), (void*)&temp);
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +      
 +      
 +      //local buffer argument:
 +      /** @brief Sets an OpenCL local memory object at the provided position */
 +      void arg(unsigned int pos, const local_mem & mem)
 +      {
 +        unsigned int size =  mem.size();
 +        init();
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Setting local memory kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
 +        #endif
 +        cl_int err = clSetKernelArg(handle_.get(), pos, size, 0);
 +        VIENNACL_ERR_CHECK(err);
 +      }
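 +      
 +      // Passing only a byte count with a NULL argument pointer, as done above
 +      // for local_mem, is the standard OpenCL idiom for reserving __local
 +      // memory; the scalar overloads instead pass sizeof(type) together with
 +      // a pointer to the value.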
 +      
 +      
 +      
 +      /** @brief Convenience function for setting one kernel parameter */
 +      template <typename T0>
 +      kernel & operator()(T0 const & t0)
 +      {
 +         arg(0, t0);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting two kernel parameters */
 +      template <typename T0, typename T1>
 +      kernel & operator()(T0 const & t0, T1 const & t1)
 +      {
 +         arg(0, t0); arg(1, t1);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting three kernel parameters */
 +      template <typename T0, typename T1, typename T2>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting four kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting five kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting six kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting seven kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting eight kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6, T7 const & t7)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting nine kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6, T7 const & t7, T8 const & t8)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting ten kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4,
 +                typename T5, typename T6, typename T7, typename T8, typename T9>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4,
 +                          T5 const & t5, T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting eleven kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting twelve kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting thirteen kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11, typename T12>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11, T12 const & t12)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11); arg(12, t12);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting fourteen kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting fifteen kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting sixteen kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting seventeen kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting eighteen kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17)
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting nineteen kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting twenty kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting twenty-one kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting twenty-two kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20, typename T21>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting twenty-three kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20, typename T21, typename T22>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting twenty-four kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting twenty-five kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
 +                typename T24>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
 +                          T24 const & t24
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
 +         arg(24, t24);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting twenty-six kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
 +                typename T24, typename T25>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
 +                          T24 const & t24, T25 const & t25 
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
 +         arg(24, t24); arg(25, t25);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting twenty-seven kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
 +                typename T24, typename T25, typename T26>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
 +                          T24 const & t24, T25 const & t25, T26 const & t26 
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
 +         arg(24, t24); arg(25, t25); arg(26, t26);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting twenty-eight kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
 +                typename T24, typename T25, typename T26, typename T27>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
 +                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27 
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
 +         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting twenty-nine kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
 +                typename T24, typename T25, typename T26, typename T27, typename T28>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
 +                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28 
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
 +         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting thirty kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
 +                typename T24, typename T25, typename T26, typename T27, typename T28, typename T29>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
 +                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29 
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
 +         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting thirty-one kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
 +                typename T24, typename T25, typename T26, typename T27, typename T28, typename T29,
 +                typename T30>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
 +                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29,
 +                          T30 const & t30 
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
 +         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
 +         arg(30, t30);
 +         return *this;
 +      }     
 +
 +      /** @brief Convenience function for setting thirty-two kernel parameters */
 +      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
 +                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
 +                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
 +                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
 +                typename T24, typename T25, typename T26, typename T27, typename T28, typename T29,
 +                typename T30, typename T31>
 +      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
 +                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
 +                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
 +                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
 +                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29,
 +                          T30 const & t30, T31 const & t31 
 +                         )
 +      {
 +         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
 +         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
 +         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
 +         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
 +         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
 +         arg(30, t30); arg(31, t31);
 +         return *this;
 +      }     
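 +
 +      // Editorial sketch (not part of the upstream sources): each operator()
 +      // overload above simply forwards its arguments to arg(0, ...),
 +      // arg(1, ...), and so on, and returns *this, so a fully configured
 +      // kernel can be passed straight to enqueue(). Assuming vectors "x",
 +      // "y" and a float "alpha" in the current context:
 +      //
 +      //   viennacl::ocl::kernel & k = prog.get_kernel("axpy");   // "prog" and "axpy" are hypothetical
 +      //   viennacl::ocl::enqueue(k(x, y, alpha, cl_uint(x.size())));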
 +
 +
 +
 +
 +      /** @brief Returns the local work size at the respective dimension
 +      *
 +      * @param index   Dimension index (currently either 0 or 1)
 +      */
 +      size_type local_work_size(int index = 0) const
 +      {
 +        assert(index == 0 || index == 1);
 +        return local_work_size_[index];
 +      }
 +      /** @brief Returns the global work size at the respective dimension
 +      *
 +      * @param index   Dimension index (currently either 0 or 1)
 +      */
 +      size_type global_work_size(int index = 0) const
 +      { 
 +        assert(index == 0 || index == 1);
 +        return global_work_size_[index];
 +      }
 +
 +      /** @brief Sets the local work size at the respective dimension
 +      *
 +      * @param index   Dimension index (currently either 0 or 1)
 +      * @param s       The new local work size
 +      */
 +      void local_work_size(int index, size_type s)
 +      {
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Setting local work size to " << s << " at index " << index << " for kernel " << name_ << std::endl;
 +        #endif
 +        assert(index == 0 || index == 1);
 +        local_work_size_[index] = s;
 +      }
 +      /** @brief Sets the global work size at the respective dimension
 +      *
 +      * @param index   Dimension index (currently either 0 or 1)
 +      * @param s       The new global work size
 +      */
 +      void global_work_size(int index, size_type s)
 +      { 
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Setting global work size to " << s << " at index " << index << " for kernel " << name_ << std::endl;
 +        #endif
 +        assert(index == 0 || index == 1);
 +        global_work_size_[index] = s;
 +      }
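 +
 +      // Editorial sketch (not part of the upstream sources): these work sizes
 +      // are plain members read later when the kernel is enqueued, so they can
 +      // be tuned per kernel. OpenCL requires the global size to be a multiple
 +      // of the local size, hence the rounding below for a problem size N:
 +      //
 +      //   k.local_work_size(0, 256);
 +      //   k.global_work_size(0, ((N + 255) / 256) * 256);  // round N up to a multiple of 256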
 +
 +      std::string const & name() const { return name_; }
 +
 +      viennacl::ocl::handle<cl_kernel> const & handle() const { return handle_; }
 +
 +
 +    private:
 +      void create_kernel()
 +      {
 +        cl_int err;
 +        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +        std::cout << "ViennaCL: Building kernel " << name_ << std::endl;
 +        #endif
 +        handle_ = clCreateKernel(program_.get(), name_.c_str(), &err);
 +        
 +        if (err != CL_SUCCESS)
 +        {
 +          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
 +          std::cout << "ViennaCL: Could not create kernel '" << name_ << "'." << std::endl;
 +          #endif
 +        }
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +
 +      void set_work_size_defaults()
 +      {
 +        if (viennacl::ocl::current_device().type() == CL_DEVICE_TYPE_GPU)
 +        {
 +          local_work_size_[0] = 128; local_work_size_[1] = 0;
 +          global_work_size_[0] = 128*128; global_work_size_[1] = 0;
 +        }
 +        else //assume CPU type:
 +        {
 +          //conservative assumption: one thread per CPU core:
 +          local_work_size_[0] = 1; local_work_size_[1] = 0;
 +          global_work_size_[0] = viennacl::ocl::current_device().max_compute_units(); global_work_size_[1] = 0;
 +        }
 +      }
 +
 +      void init()
 +      {
 +        if (!init_done_)
 +        {
 +          create_kernel();
 +          init_done_ = true;
 +        }
 +      }
 +      
 +      viennacl::ocl::handle<cl_kernel> handle_;
 +      viennacl::ocl::handle<cl_program> program_;
 +      std::string name_;
 +      bool init_done_;
 +      size_type local_work_size_[2];
 +      size_type global_work_size_[2];
 +    };
 +    
 +  } //namespace ocl
 +} //namespace viennacl
 +
 +#endif
++=======
+ #ifndef VIENNACL_OCL_KERNEL_HPP_
+ #define VIENNACL_OCL_KERNEL_HPP_
+ 
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file viennacl/ocl/kernel.hpp
+     @brief Representation of an OpenCL kernel in ViennaCL.
+ */
+ 
+ #ifdef __APPLE__
+ #include <OpenCL/cl.h>
+ #else
+ #include <CL/cl.h>
+ #endif
+ 
+ #include "viennacl/ocl/forwards.h"
+ #include "viennacl/ocl/handle.hpp"
+ #include "viennacl/ocl/program.hpp"
+ #include "viennacl/ocl/device.hpp"
+ #include "viennacl/ocl/local_mem.hpp"
+ #include "viennacl/ocl/infos.hpp"
+ 
+ namespace viennacl
+ {
+   namespace ocl
+   {
+     /** @brief Helper class for packing four cl_uint numbers into a uint4 type for access inside an OpenCL kernel.
+       *
+       * Since the primary use is for dealing with ranges and strides, the four members are termed accordingly.
+       */
+     struct packed_cl_uint
+     {
+       /** @brief Starting value of the integer stride. */
+       cl_uint start;
+       /** @brief Increment between integers. */
+       cl_uint stride;
+       /** @brief Number of values in the stride. */
+       cl_uint size;
+       /** @brief Internal length of the buffer. Might be larger than 'size' due to padding. */
+       cl_uint internal_size;
+     };
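+
+     // Editorial sketch (not part of the upstream sources): packed_cl_uint has
+     // the same layout as a uint4 inside an OpenCL kernel, so the start, stride,
+     // size and padded (internal) size of a strided vector travel as a single
+     // kernel argument. For a hypothetical kernel "k" taking a uint4 first:
+     //
+     //   packed_cl_uint layout;
+     //   layout.start = 0;    layout.stride = 1;
+     //   layout.size = 1000;  layout.internal_size = 1024;  // length including padding
+     //   k.arg(0, layout);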
+ 
+     /** @brief Represents an OpenCL kernel within ViennaCL */
+     class kernel
+     {
+       template <typename KernelType>
+       friend void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue);
+ 
+       template<cl_kernel_info param>
+       friend typename detail::return_type<cl_kernel, param>::Result info(viennacl::ocl::kernel & k);
+ 
+       template<cl_kernel_info param>
+       friend typename detail::return_type<cl_kernel, param>::Result info(viennacl::ocl::kernel & k, viennacl::ocl::device const & d);
+ 
+ 
+     public:
+       typedef vcl_size_t            size_type;
+ 
+       kernel() : handle_(), p_program_(NULL), p_context_(NULL), name_()
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Creating kernel object (default CTOR)" << std::endl;
+         #endif
+       }
+ 
+       kernel(cl_kernel kernel_handle, viennacl::ocl::program const & kernel_program, viennacl::ocl::context const & kernel_context, std::string const & name)
+         : handle_(kernel_handle, kernel_context), p_program_(&kernel_program), p_context_(&kernel_context), name_(name)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Creating kernel object (full CTOR)" << std::endl;
+         #endif
+         set_work_size_defaults();
+       }
+ 
+       kernel(kernel const & other)
+         : handle_(other.handle_), p_program_(other.p_program_), p_context_(other.p_context_), name_(other.name_)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Creating kernel object (Copy CTOR)" << std::endl;
+         #endif
+         local_work_size_[0] = other.local_work_size_[0];
+         local_work_size_[1] = other.local_work_size_[1];
+         local_work_size_[2] = other.local_work_size_[2];
+ 
+         global_work_size_[0] = other.global_work_size_[0];
+         global_work_size_[1] = other.global_work_size_[1];
+         global_work_size_[2] = other.global_work_size_[2];
+       }
+ 
+       viennacl::ocl::kernel & operator=(const kernel & other)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Assigning kernel object" << std::endl;
+         #endif
+         handle_ = other.handle_;
+         p_program_ = other.p_program_;
+         p_context_ = other.p_context_;
+         name_ = other.name_;
+         local_work_size_[0] = other.local_work_size_[0];
+         local_work_size_[1] = other.local_work_size_[1];
+         local_work_size_[2] = other.local_work_size_[2];
+         global_work_size_[0] = other.global_work_size_[0];
+         global_work_size_[1] = other.global_work_size_[1];
+         global_work_size_[2] = other.global_work_size_[2];
+         return *this;
+       }
+ 
+       /** @brief Sets a char argument at the provided position */
+       void arg(unsigned int pos, cl_char val)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting char kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_char), (void*)&val);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+       /** @brief Sets an unsigned char argument at the provided position */
+       void arg(unsigned int pos, cl_uchar val)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting unsigned char kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_uchar), (void*)&val);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+       /** @brief Sets an argument of type short at the provided position */
+       void arg(unsigned int pos, cl_short val)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting short kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_short), (void*)&val);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+       /** @brief Sets an argument of type unsigned short at the provided position */
+       void arg(unsigned int pos, cl_ushort val)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting unsigned short kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_ushort), (void*)&val);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+ 
+       /** @brief Sets an unsigned integer argument at the provided position */
+       void arg(unsigned int pos, cl_uint val)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting unsigned int kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_uint), (void*)&val);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+       /** @brief Sets four packed unsigned integers as argument at the provided position */
+       void arg(unsigned int pos, packed_cl_uint val)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting packed_cl_uint kernel argument (" << val.start << ", " << val.stride << ", " << val.size << ", " << val.internal_size << ") at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(packed_cl_uint), (void*)&val);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+       /** @brief Sets a single precision floating point argument at the provided position */
+       void arg(unsigned int pos, float val)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting floating point kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(float), (void*)&val);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+       /** @brief Sets a double precision floating point argument at the provided position */
+       void arg(unsigned int pos, double val)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting double precision kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(double), (void*)&val);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+       /** @brief Sets an int argument at the provided position */
+       void arg(unsigned int pos, cl_int val)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting int precision kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_int), (void*)&val);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+       /** @brief Sets an unsigned long argument at the provided position */
+       void arg(unsigned int pos, cl_ulong val)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting ulong precision kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_ulong), (void*)&val);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+       /** @brief Sets a long argument at the provided position */
+       void arg(unsigned int pos, cl_long val)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting long precision kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_long), (void*)&val);
+         VIENNACL_ERR_CHECK(err);
+       }
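+
+       // Editorial note (not part of the upstream sources): clSetKernelArg
+       // matches host and kernel types by byte size only, so the host value
+       // must have exactly the type of the kernel parameter. For a kernel
+       // parameter declared as "float alpha", cast explicitly:
+       //
+       //   k.arg(2, static_cast<float>(alpha));  // passing a double here would yield CL_INVALID_ARG_SIZE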
+ 
+       //generic handling: call .handle() member
+       /** @brief Sets an OpenCL memory object at the provided position */
+       template<class VCL_TYPE>
+       void arg(unsigned int pos, VCL_TYPE const & val)
+       {
+         assert(&val.handle().opencl_handle().context() == &handle_.context() && bool("Kernel and memory object not in the same context!"));
+ 
+         cl_mem temp = val.handle().opencl_handle().get();
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting generic kernel argument " << temp << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_mem), (void*)&temp);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+       //forward handles directly:
+       /** @brief Sets an OpenCL object at the provided position */
+       template<class CL_TYPE>
+       void arg(unsigned int pos, viennacl::ocl::handle<CL_TYPE> const & h)
+       {
+         CL_TYPE temp = h.get();
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting handle kernel argument " << temp << " at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(CL_TYPE), (void*)&temp);
+         VIENNACL_ERR_CHECK(err);
+       }
+ 
+ 
+       //local buffer argument:
+       /** @brief Sets an OpenCL local memory object at the provided position */
+       void arg(unsigned int pos, const local_mem & mem)
+       {
+         cl_uint size = static_cast<cl_uint>(mem.size());
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting local memory kernel argument of size " << size << " bytes at pos " << pos << " for kernel " << name_ << std::endl;
+         #endif
+         cl_int err = clSetKernelArg(handle_.get(), pos, size, 0);
+         VIENNACL_ERR_CHECK(err);
+       }
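+
+       // Editorial sketch (not part of the upstream sources): local_mem only
+       // carries a byte count; clSetKernelArg with a null data pointer then
+       // reserves that much __local memory per work group (bounded by the
+       // device's CL_DEVICE_LOCAL_MEM_SIZE). One float per work item of a
+       // 128-thread work group would be:
+       //
+       //   k.arg(1, viennacl::ocl::local_mem(128 * sizeof(float)));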
+ 
+ 
+ 
+       /** @brief Convenience function for setting one kernel parameter */
+       template <typename T0>
+       kernel & operator()(T0 const & t0)
+       {
+          arg(0, t0);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting two kernel parameters */
+       template <typename T0, typename T1>
+       kernel & operator()(T0 const & t0, T1 const & t1)
+       {
+          arg(0, t0); arg(1, t1);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting three kernel parameters */
+       template <typename T0, typename T1, typename T2>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting four kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting five kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting six kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting seven kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting eight kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6, T7 const & t7)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting nine kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6, T7 const & t7, T8 const & t8)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting ten kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4,
+                 typename T5, typename T6, typename T7, typename T8, typename T9>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4,
+                           T5 const & t5, T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting eleven kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting twelve kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting thirteen kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11, typename T12>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11, T12 const & t12)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11); arg(12, t12);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting fourteen kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting fifteen kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting sixteen kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting seventeen kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting eighteen kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17)
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting nineteen kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting twenty kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting twenty-one kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting twenty-two kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20, typename T21>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting 23 kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20, typename T21, typename T22>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting 24 kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20, typename T21, typename T22, typename T23>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting 25 kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                 typename T24>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                           T24 const & t24
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+          arg(24, t24);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting 26 kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                 typename T24, typename T25>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                           T24 const & t24, T25 const & t25
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+          arg(24, t24); arg(25, t25);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting 27 kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                 typename T24, typename T25, typename T26>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                           T24 const & t24, T25 const & t25, T26 const & t26
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+          arg(24, t24); arg(25, t25); arg(26, t26);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting 28 kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                 typename T24, typename T25, typename T26, typename T27>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                           T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+          arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting 29 kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                 typename T24, typename T25, typename T26, typename T27, typename T28>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                           T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+          arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting 30 kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                 typename T24, typename T25, typename T26, typename T27, typename T28, typename T29>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                           T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+          arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting 31 kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                 typename T24, typename T25, typename T26, typename T27, typename T28, typename T29,
+                 typename T30>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                           T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29,
+                           T30 const & t30
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+          arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
+          arg(30, t30);
+          return *this;
+       }
+ 
+       /** @brief Convenience function for setting 32 kernel parameters */
+       template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                 typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                 typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                 typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                 typename T24, typename T25, typename T26, typename T27, typename T28, typename T29,
+                 typename T30, typename T31>
+       kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                           T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                           T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                           T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                           T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29,
+                           T30 const & t30, T31 const & t31
+                          )
+       {
+          arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+          arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+          arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+          arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+          arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
+          arg(30, t30); arg(31, t31);
+          return *this;
+       }
+ 
+ 
+ 
+ 
+       /** @brief Returns the local work size at the respective dimension
+       *
+       * @param index   Dimension index (0, 1, or 2)
+       */
+       size_type local_work_size(int index = 0) const
+       {
+         assert(index < 3 && bool("Work size index out of bounds"));
+         return local_work_size_[index];
+       }
+       /** @brief Returns the global work size at the respective dimension
+       *
+       * @param index   Dimension index (0, 1, or 2)
+       */
+       size_type global_work_size(int index = 0) const
+       {
+         assert(index < 3 && bool("Work size index out of bounds"));
+         return global_work_size_[index];
+       }
+ 
+       /** @brief Sets the local work size at the respective dimension
+       *
+       * @param index   Dimension index (0, 1, or 2)
+       * @param s       The new local work size
+       */
+       void local_work_size(int index, size_type s)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting local work size to " << s << " at index " << index << " for kernel " << name_ << std::endl;
+         #endif
+         assert(index < 3 && bool("Work size index out of bounds"));
+         local_work_size_[index] = s;
+       }
+       /** @brief Sets the global work size at the respective dimension
+       *
+       * @param index   Dimension index (0, 1, or 2)
+       * @param s       The new global work size
+       */
+       void global_work_size(int index, size_type s)
+       {
+         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+         std::cout << "ViennaCL: Setting global work size to " << s << " at index " << index << " for kernel " << name_ << std::endl;
+         #endif
+         assert(index < 3 && bool("Work size index out of bounds"));
+         global_work_size_[index] = s;
+       }
+ 
+       std::string const & name() const { return name_; }
+ 
+       viennacl::ocl::handle<cl_kernel> const & handle() const { return handle_; }
+ 
+       viennacl::ocl::context const & context() const { return *p_context_; }
+ 
+     private:
+ 
+       inline void set_work_size_defaults();    //see context.hpp for implementation
+ 
+       viennacl::ocl::handle<cl_kernel> handle_;
+       viennacl::ocl::program const * p_program_;
+       viennacl::ocl::context const * p_context_;
+       std::string name_;
+       size_type local_work_size_[3];
+       size_type global_work_size_[3];
+     };
+ 
+     /** @brief Queries information about a kernel
+      *
+      * @param k Corresponding kernel
+      */
+     template<cl_kernel_info param>
+     typename detail::return_type<cl_kernel, param>::Result info(viennacl::ocl::kernel & k)
+     {
+         typedef typename detail::return_type<cl_kernel, param>::Result res_t;
+         return detail::get_info_impl<res_t>()(k.handle_.get(),param);
+     }
+ 
+   /** @brief Queries information about the execution of a kernel on a particular device
+    *
+    * @param k Corresponding kernel
+    * @param d Corresponding device
+    */
+     template<cl_kernel_info param>
+     typename detail::return_type<cl_kernel, param>::Result info(viennacl::ocl::kernel & k, viennacl::ocl::device const & d)
+     {
+         typedef typename detail::return_type<cl_kernel, param>::Result res_t;
+         return detail::get_info_impl<res_t>()(k.handle_.get(),d.id(),param);
+     }
+ 
+   } //namespace ocl
+ } //namespace viennacl
+ 
+ #endif
++>>>>>>> upstream/1.5.1
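
For orientation: each operator() overload above simply forwards its arguments to arg(0, t0) ... arg(N, tN) and returns the kernel by reference, so setting arguments and enqueuing collapses into one expression. A minimal usage sketch, assuming a compiled viennacl::ocl::program named 'prog' and device buffers 'v1'/'v2' of length 'n' (these names are hypothetical, not part of this diff):

    viennacl::ocl::kernel & k = prog.get_kernel("my_kernel");
    k.local_work_size(0, 128);            // dimension index may be 0, 1, or 2
    k.global_work_size(0, 128 * 1024);
    viennacl::ocl::enqueue(k(v1, v2, static_cast<cl_uint>(n)));  // sets args 0..2, then launches
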
diff --cc viennacl/ocl/platform.hpp
index 83c4e21,11a4708..2ea5576
--- a/viennacl/ocl/platform.hpp
+++ b/viennacl/ocl/platform.hpp
@@@ -35,11 -36,17 +36,21 @@@ namespace viennac
  {
    namespace ocl
    {
+ 
+     /** @brief Wrapper class for an OpenCL platform.
+       *
+       * This class was written before the OpenCL C++ bindings were standardized.
+       * Regardless, it takes care of some additional details and is intended to provide extra convenience.
+       */
      class platform
      {
-       
+ 
        public:
++<<<<<<< HEAD
 +        platform(std::size_t pf_index = 0)
++=======
+         platform(vcl_size_t pf_index = 0)
++>>>>>>> upstream/1.5.1
          {
            cl_int err;
            cl_uint num_platforms;
@@@ -49,11 -56,20 +60,26 @@@
            #endif
            err = clGetPlatformIDs(42, ids, &num_platforms);
            VIENNACL_ERR_CHECK(err);
++<<<<<<< HEAD
 +          assert(num_platforms > pf_index && "ViennaCL: ERROR: Not enough platforms found!");          
 +          id_ = ids[pf_index];
 +          assert(num_platforms > 0 && "ViennaCL: ERROR: No platform found!");          
++=======
+           assert(num_platforms > pf_index && bool("ViennaCL: ERROR: Not enough platforms found!"));
+           id_ = ids[pf_index];
+           assert(num_platforms > 0 && bool("ViennaCL: ERROR: No platform found!"));
+         }
+ 
+         platform(cl_platform_id pf_id) : id_(pf_id) {}
+ 
+         platform(platform const & other) : id_(other.id_) {}
+ 
+         void operator=(cl_platform_id pf_id)
+         {
+           id_ = pf_id;
++>>>>>>> upstream/1.5.1
          }
-         
+ 
          cl_platform_id id() const
          {
            return id_;
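
The wrapper above reduces platform selection to a single constructor argument. A minimal sketch, not part of this diff:

    #include "viennacl/ocl/platform.hpp"

    viennacl::ocl::platform pf(0);    // pf_index = 0 picks the first platform found
    cl_platform_id raw = pf.id();     // raw handle for direct OpenCL API calls
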
diff --cc viennacl/slice.hpp
index 361195b,3218745..094eec8
--- a/viennacl/slice.hpp
+++ b/viennacl/slice.hpp
@@@ -2,16 -2,17 +2,27 @@@
  #define VIENNACL_SLICE_HPP_
  
  /* =========================================================================
++<<<<<<< HEAD
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
++=======
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
++>>>>>>> upstream/1.5.1
  
                              -----------------
                    ViennaCL - The Vienna Computing Library
                              -----------------
  
     Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
++<<<<<<< HEAD
 +               
++=======
+ 
++>>>>>>> upstream/1.5.1
     (A list of authors and contributors can be found in the PDF manual)
  
     License:         MIT (X11), see file LICENSE in the base directory
@@@ -30,7 -31,7 +41,11 @@@ namespace viennac
  {
  
    /** @brief A slice class that refers to the index set {start, start + stride, ..., start + (size - 1) * stride}.
++<<<<<<< HEAD
 +   * 
++=======
+    *
++>>>>>>> upstream/1.5.1
   * Similar to the boost::numeric::ublas::basic_slice class.
     */
    template <typename SizeType /* see forwards.h for default argument*/,
@@@ -43,34 -44,34 +58,63 @@@
        typedef size_type            value_type;
        typedef value_type           const_reference;
        typedef const_reference      reference;
++<<<<<<< HEAD
 +      
++=======
+ 
++>>>>>>> upstream/1.5.1
        basic_slice() : start_(0), stride_(1), size_(0) {}
        basic_slice(size_type start_index,
                    difference_type stride_arg,
                    size_type size_arg) : start_(start_index), stride_(stride_arg), size_(size_arg) {}
++<<<<<<< HEAD
 +        
 +        
 +      size_type       start() const { return start_; }
 +      difference_type stride() const { return stride_; }
 +      size_type       size() const { return size_; }
 +      
 +      const_reference operator()(size_type i) const 
++=======
+ 
+ 
+       size_type       start() const { return start_; }
+       difference_type stride() const { return stride_; }
+       size_type       size() const { return size_; }
+ 
+       const_reference operator()(size_type i) const
++>>>>>>> upstream/1.5.1
        {
          assert(i < size());
          return start_ + i * stride_;
        }
        const_reference operator[](size_type i) const { return operator()(i); }
++<<<<<<< HEAD
 +      
 +      bool operator==(const basic_slice & s) const { return (start_ == s.start_) && (stride_ == s.stride_) && (size_ == s.size_); }
 +      bool operator!=(const basic_slice & s) const { return !(*this == s); }
 +      
++=======
+ 
+       bool operator==(const basic_slice & s) const { return (start_ == s.start_) && (stride_ == s.stride_) && (size_ == s.size_); }
+       bool operator!=(const basic_slice & s) const { return !(*this == s); }
+ 
++>>>>>>> upstream/1.5.1
      private:
        size_type start_;
        difference_type stride_;
        size_type size_;
    };
++<<<<<<< HEAD
 +  
 +  
 +}
 +
- #endif
++#endif
++=======
+ 
+ 
+ }
+ 
+ #endif
++>>>>>>> upstream/1.5.1
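
To make the slice semantics concrete: a basic_slice with start 2, stride 3, and size 4 enumerates the indices 2, 5, 8, 11. A small sketch, assuming the viennacl::slice typedef from forwards.h:

    viennacl::slice s(2, 3, 4);
    // s.size() == 4;  s(0) == 2, s(1) == 5, s(2) == 8, s[3] == 11
    // equality compares start, stride, and size: s == viennacl::slice(2, 3, 4)
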
diff --cc viennacl/toeplitz_matrix.hpp
index 9e7c9c5,af7adc5..e48c511
--- a/viennacl/toeplitz_matrix.hpp
+++ b/viennacl/toeplitz_matrix.hpp
@@@ -262,9 -266,9 +266,13 @@@ namespace viennacl 
          copy(gpu_matrix, tmp);
          s << "[" << size << "," << size << "](";
  
-         for(std::size_t i = 0; i < size; i++) {
+         for(vcl_size_t i = 0; i < size; i++) {
              s << "(";
++<<<<<<< HEAD
 +            for(std::size_t j = 0; j < size; j++) {
++=======
+             for(vcl_size_t j = 0; j < size; j++) {
++>>>>>>> upstream/1.5.1
                  s << tmp[static_cast<int>(j) - static_cast<int>(i) + static_cast<int>(size - 1)];
                  //s << (int)i - (int)j;
                  if(j < (size - 1)) s << ",";
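
The loop above reads entry (i,j) of an n-by-n Toeplitz matrix from a flat buffer of 2n-1 values: the matrix is constant along each diagonal, so only the values t(-(n-1)), ..., t(0), ..., t(n-1) need to be stored. A standalone restatement of that indexing (hypothetical helper, not part of this diff):

    #include <cstddef>
    #include <vector>

    // Entry (i,j) lives at offset j - i + (n - 1); the offset is always in [0, 2n-2].
    double toeplitz_entry(std::vector<double> const & tmp,
                          std::size_t i, std::size_t j, std::size_t n)
    {
      return tmp[static_cast<int>(j) - static_cast<int>(i) + static_cast<int>(n - 1)];
    }
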
diff --cc viennacl/tools/adapter.hpp
index 2dcc182,0b753ea..8e2947e
--- a/viennacl/tools/adapter.hpp
+++ b/viennacl/tools/adapter.hpp
@@@ -131,19 -132,17 +132,24 @@@ namespace viennac
            }
            return *this;
          }
-         
+ 
          bool operator==(self_type const & other) const
          {
-           if (is_iterator1)
-             return (i_ == other.i_);
-           return (iter2 == other.iter2);
+           return is_iterator1 ? (i_ == other.i_) : (iter2 == other.iter2);
          }
-         
+ 
          bool operator!=(self_type const & other) const { return !(*this == other); }
++<<<<<<< HEAD
 +        
 +        size_type index1() const { return i_; }
 +        size_type index2() const
 +        { 
++=======
+ 
+         size_type index1() const { return i_; }
+         size_type index2() const
+         {
++>>>>>>> upstream/1.5.1
            if (is_iterator1)
              return 0;
            else
@@@ -308,10 -306,10 +313,17 @@@
            return (iter2 == other.iter2);
          }
          bool operator!=(self_type const & other) const { return !(*this == other); }
++<<<<<<< HEAD
 +        
 +        size_type index1() const { return i_; }
 +        size_type index2() const
 +        { 
++=======
+ 
+         size_type index1() const { return i_; }
+         size_type index2() const
+         {
++>>>>>>> upstream/1.5.1
            if (is_iterator1)
              return 0;
            else
diff --cc viennacl/tools/matrix_size_deducer.hpp
index ccdd043,b15dc1b..7e011a2
--- a/viennacl/tools/matrix_size_deducer.hpp
+++ b/viennacl/tools/matrix_size_deducer.hpp
@@@ -45,130 -47,167 +47,211 @@@ namespace viennac
      struct MATRIX_SIZE_DEDUCER
      {
        //Standard case: size1 from lhs, size2 from rhs (fits most cases)
-       static size_t size1(LHS & lhs, RHS & rhs) { return lhs.size1(); }
-       static size_t size2(LHS & lhs, RHS & rhs) { return rhs.size2(); }
+       static vcl_size_t size1(LHS & lhs, RHS & /*rhs*/) { return lhs.size1(); }
+       static vcl_size_t size2(LHS & /*lhs*/, RHS & rhs) { return rhs.size2(); }
      };
-     
+ 
+     /** \cond */
      //special case: outer vector product:
-     template <typename ScalarType, unsigned int A1, unsigned int A2>
-     struct MATRIX_SIZE_DEDUCER<viennacl::vector<ScalarType, A1>,
-                                viennacl::vector<ScalarType, A2>,
+     template <typename ScalarType>
+     struct MATRIX_SIZE_DEDUCER<const viennacl::vector_base<ScalarType>,
+                                const viennacl::vector_base<ScalarType>,
                                 viennacl::op_prod>
      {
-       static size_t size1(viennacl::vector<ScalarType, A1> & lhs,
-                           viennacl::vector<ScalarType, A2> & rhs) { return lhs.size1(); }
+       static vcl_size_t size1(viennacl::vector_base<ScalarType> const & lhs,
+                                viennacl::vector_base<ScalarType> const & /*rhs*/) { return lhs.size(); }
  
-       static size_t size2(viennacl::vector<ScalarType, A1> & lhs,
-                           viennacl::vector<ScalarType, A2> & rhs) { return rhs.size2(); }
+       static vcl_size_t size2(viennacl::vector_base<ScalarType> const & /*lhs*/,
+                                viennacl::vector_base<ScalarType> const & rhs) { return rhs.size(); }
      };
  
-     //special case: transposed matrix-Something product: Return the number of rows of the matrix
-     /*template <typename MatrixType, typename ScalarType, unsigned int A>
-     struct MATRIX_SIZE_DEDUCER<MatrixType, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
+ 
+     //special case: multiplication with a scalar
+     template <typename LHS, typename RHS, typename OP, typename ScalarType>
+     struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<const LHS, const RHS, OP>,
+                                const ScalarType,
+                                viennacl::op_mult>
      {
-       static unsigned int size(MatrixType & lhs, const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-     };*/
+       static vcl_size_t size1(viennacl::matrix_expression<const LHS, const RHS, OP> const & lhs,
+                                ScalarType const & /*rhs*/) { return MATRIX_SIZE_DEDUCER<const LHS, const RHS, OP>::size1(lhs.lhs(), lhs.rhs()); }
  
-     // A^T * B
-     template <typename ScalarType, typename T1, typename F2, unsigned int A2>
-     struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<T1,
-                                                                  T1, op_trans>,
-                                const viennacl::matrix<ScalarType, F2, A2>,
-                                viennacl::op_prod>
+       static vcl_size_t size2(viennacl::matrix_expression<const LHS, const RHS, OP> const & lhs,
+                                ScalarType const & /*rhs*/) { return MATRIX_SIZE_DEDUCER<const LHS, const RHS, OP>::size2(lhs.lhs(), lhs.rhs()); }
+     };
+ 
+     //special case: multiplication with a scalar
+     template <typename T, typename F, typename ScalarType>
+     struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_base<T, F>,
+                                const ScalarType,
+                                viennacl::op_mult>
      {
-       static std::size_t size1(viennacl::matrix_expression<T1,
-                                                            T1,
-                                                            op_trans> const & lhs,
-                                viennacl::matrix<ScalarType, F2, A2> const & rhs) { return lhs.lhs().size2(); }
-       static std::size_t size2(viennacl::matrix_expression<T1,
-                                                            T1,
-                                                            op_trans> const & lhs,
-                                viennacl::matrix<ScalarType, F2, A2> const & rhs) { return rhs.size2(); }
+       static vcl_size_t size1(viennacl::matrix_base<T, F> const & lhs,
+                                ScalarType const & /*rhs*/) { return lhs.size1(); }
+ 
+       static vcl_size_t size2(viennacl::matrix_base<T, F> const & lhs,
+                                ScalarType const & /*rhs*/) { return lhs.size2(); }
      };
  
-     template <typename T1, typename MatrixType2>
+ 
+     //special case: division with a scalar
+     template <typename LHS, typename RHS, typename OP, typename ScalarType>
+     struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<const LHS, const RHS, OP>,
+                                const ScalarType,
+                                viennacl::op_div>
+     {
+       static vcl_size_t size1(viennacl::matrix_expression<const LHS, const RHS, OP> const & lhs,
+                                ScalarType const & /*rhs*/) { return MATRIX_SIZE_DEDUCER<const LHS, const RHS, OP>::size1(lhs.lhs(), lhs.rhs()); }
+ 
+       static vcl_size_t size2(viennacl::matrix_expression<const LHS, const RHS, OP> const & lhs,
+                                ScalarType const & /*rhs*/) { return MATRIX_SIZE_DEDUCER<const LHS, const RHS, OP>::size2(lhs.lhs(), lhs.rhs()); }
+     };
+ 
+     //special case: division with a scalar
+     template <typename T, typename F, typename ScalarType>
+     struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_base<T, F>,
+                                const ScalarType,
+                                viennacl::op_div>
+     {
+       static vcl_size_t size1(viennacl::matrix_base<T, F> const & lhs,
+                                ScalarType const & /*rhs*/) { return lhs.size1(); }
+ 
+       static vcl_size_t size2(viennacl::matrix_base<T, F> const & lhs,
+                                ScalarType const & /*rhs*/) { return lhs.size2(); }
+     };
+ 
+     //special case: diagonal from vector
+     template <typename T>
+     struct MATRIX_SIZE_DEDUCER<const viennacl::vector_base<T>,
+                                const int,
+                                viennacl::op_vector_diag>
+     {
+       static vcl_size_t size1(viennacl::vector_base<T> const & lhs,
+                                const int k) { return lhs.size() + static_cast<vcl_size_t>(std::fabs(double(k))); }
+ 
+       static vcl_size_t size2(viennacl::vector_base<T> const & lhs,
+                                const int k) { return lhs.size() + static_cast<vcl_size_t>(std::fabs(double(k))); }
+     };
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+     //special case: transposed matrix: return the dimensions of the underlying matrix, swapped
+     template <typename MatrixType>
+     struct MATRIX_SIZE_DEDUCER<MatrixType,
+                                MatrixType,
+                                viennacl::op_trans>
+     {
+       static vcl_size_t size1(const MatrixType & lhs,
+                                const MatrixType & /*rhs*/) { return lhs.size2(); }
+       static vcl_size_t size2(const MatrixType & lhs,
+                                const MatrixType & /*rhs*/) { return lhs.size1(); }
+     };
+ 
+     // A^T * B
+     template <typename ScalarType, typename T1, typename F2>
      struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<T1,
                                                                   T1, op_trans>,
-                                const viennacl::matrix_range<MatrixType2>,
-                                viennacl::op_prod>
+                                const viennacl::matrix_base<ScalarType, F2>,
+                                viennacl::op_mat_mat_prod>
      {
-       static std::size_t size1(viennacl::matrix_expression<T1,
+       static vcl_size_t size1(viennacl::matrix_expression<T1,
                                                             T1,
                                                             op_trans> const & lhs,
-                                viennacl::matrix_range<MatrixType2> const & rhs) { return lhs.lhs().size2(); }
-       static std::size_t size2(viennacl::matrix_expression<T1,
+                                viennacl::matrix_base<ScalarType, F2> const & /*rhs*/) { return lhs.lhs().size2(); }
+       static vcl_size_t size2(viennacl::matrix_expression<T1,
                                                             T1,
-                                                            op_trans> const & lhs,
-                                viennacl::matrix_range<MatrixType2> const & rhs) { return rhs.size2(); }
+                                                            op_trans> const & /*lhs*/,
+                                viennacl::matrix_base<ScalarType, F2> const & rhs) { return rhs.size2(); }
      };
  
++<<<<<<< HEAD
 +    template <typename T1, typename MatrixType2>
 +    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<T1,
 +                                                                 T1, op_trans>,
 +                               const viennacl::matrix_slice<MatrixType2>,
 +                               viennacl::op_prod>
 +    {
 +      static std::size_t size1(viennacl::matrix_expression<T1,
 +                                                           T1,
 +                                                           op_trans> const & lhs,
 +                               viennacl::matrix_slice<MatrixType2> const & rhs) { return lhs.lhs().size2(); }
 +      static std::size_t size2(viennacl::matrix_expression<T1,
 +                                                           T1,
 +                                                           op_trans> const & lhs,
 +                               viennacl::matrix_slice<MatrixType2> const & rhs) { return rhs.size2(); }
 +    };
 +    
 +    
 +    // A * B^T 
 +    
 +    template <typename ScalarType, typename F1, unsigned int A1, typename T2>
 +    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix<ScalarType, F1, A1>,
++=======
+ 
+     // A * B^T
+ 
+     template <typename ScalarType, typename F1, typename T2>
+     struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_base<ScalarType, F1>,
++>>>>>>> upstream/1.5.1
                                 const viennacl::matrix_expression<T2,
                                                                   T2, op_trans>,
-                                viennacl::op_prod>
+                                viennacl::op_mat_mat_prod>
      {
-       static std::size_t size1(viennacl::matrix<ScalarType, F1, A1> const & lhs,
+       static vcl_size_t size1(viennacl::matrix_base<ScalarType, F1> const & lhs,
                                 viennacl::matrix_expression<T2,
                                                             T2,
-                                                            op_trans> const & rhs) { return lhs.size1(); }
-       static std::size_t size2(viennacl::matrix<ScalarType, F1, A1> const & lhs,
+                                                            op_trans> const & /*rhs*/) { return lhs.size1(); }
+       static vcl_size_t size2(viennacl::matrix_base<ScalarType, F1> const & /*lhs*/,
                                 viennacl::matrix_expression<T2,
                                                             T2,
                                                             op_trans> const & rhs) { return rhs.lhs().size1(); }
      };
  
-     template <typename MatrixType1, typename T2>
-     struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_range<MatrixType1>,
+ 
+ 
+ 
+     // A^T * B^T
+ 
+     template <typename T1, typename T2>
+     struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<T1,
+                                                                  T1, op_trans>,
                                 const viennacl::matrix_expression<T2,
                                                                   T2, op_trans>,
-                                viennacl::op_prod>
+                                viennacl::op_mat_mat_prod>
      {
-       static std::size_t size1(viennacl::matrix_range<MatrixType1> const & lhs,
-                                viennacl::matrix_expression<T2,
-                                                            T2,
-                                                            op_trans> const & rhs) { return lhs.size1(); }
-       static std::size_t size2(viennacl::matrix_range<MatrixType1> const & lhs,
-                                viennacl::matrix_expression<T2,
-                                                            T2,
-                                                            op_trans> const & rhs) { return rhs.lhs().size1(); }
+       typedef viennacl::matrix_expression<T1, T1, op_trans>   LHSType;
+       typedef viennacl::matrix_expression<T2, T2, op_trans>   RHSType;
+ 
+       static vcl_size_t size1(LHSType const & lhs,
+                                RHSType const & /*rhs*/) { return lhs.lhs().size2(); }
+       static vcl_size_t size2(LHSType const & /*lhs*/,
+                                RHSType const & rhs) { return rhs.lhs().size1(); }
      };
++<<<<<<< HEAD
 +
 +    template <typename MatrixType1, typename T2>
 +    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_slice<MatrixType1>,
 +                               const viennacl::matrix_expression<T2,
 +                                                                 T2, op_trans>,
 +                               viennacl::op_prod>
 +    {
 +      static std::size_t size1(viennacl::matrix_slice<MatrixType1> const & lhs,
 +                               viennacl::matrix_expression<T2,
 +                                                           T2,
 +                                                           op_trans> const & rhs) { return lhs.size1(); }
 +      static std::size_t size2(viennacl::matrix_slice<MatrixType1> const & lhs,
 +                               viennacl::matrix_expression<T2,
 +                                                           T2,
 +                                                           op_trans> const & rhs) { return rhs.lhs().size1(); }
 +    };
 +    
++=======
+     /** \endcond */
++>>>>>>> upstream/1.5.1
    }
  }
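
In effect, each MATRIX_SIZE_DEDUCER specialization above answers "what are size1/size2 of the result?" without evaluating the expression. For example, with hypothetical dimensions (not part of this diff):

    viennacl::matrix<float> A(5, 3), B(5, 4);
    // For C = trans(A) * B, the A^T * B specialization deduces
    //   size1 = A.size2() = 3,   size2 = B.size2() = 4,
    // so C can be sized as 3-by-4 before the product is ever computed.
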
  
diff --cc viennacl/tools/tools.hpp
index dec2942,028d3aa..5044ead
--- a/viennacl/tools/tools.hpp
+++ b/viennacl/tools/tools.hpp
@@@ -1,409 -1,289 +1,701 @@@
++<<<<<<< HEAD
 +#ifndef VIENNACL_TOOLS_TOOLS_HPP_
 +#define VIENNACL_TOOLS_TOOLS_HPP_
 +
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file tools.hpp
 +    @brief Various little tools used here and there in ViennaCL.
 +*/
 +
 +#include <string>
 +#include <fstream>
 +#include <sstream>
 +#include "viennacl/forwards.h"
 +#include "viennacl/tools/adapter.hpp"
 +
 +
 +#ifdef VIENNACL_HAVE_UBLAS  
 +#include <boost/numeric/ublas/matrix_sparse.hpp>
 +#include <boost/numeric/ublas/matrix.hpp>
 +#endif
 +
 +#ifdef VIENNACL_HAVE_EIGEN  
 +#include <Eigen/Core>
 +#include <Eigen/Sparse>
 +#endif
 +
 +#ifdef VIENNACL_HAVE_MTL4
 +#include <boost/numeric/mtl/mtl.hpp>
 +#endif
 +
 +#include <vector>
 +#include <map>
 +
 +namespace viennacl
 +{
 +  namespace tools
 +  {
 +    
 +    /** @brief Supply suitable increment functions for the iterators: */
 +    template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
 +    struct MATRIX_ITERATOR_INCREMENTER<viennacl::row_iteration, viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
 +    {
 +      static void apply(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat, unsigned int & row, unsigned int & col) { ++row; }
 +    };
 +
 +    template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
 +    struct MATRIX_ITERATOR_INCREMENTER<viennacl::col_iteration, viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
 +    {
 +      static void apply(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat, unsigned int & row, unsigned int & col) { ++col; }
 +    };
 +
 +    
 +    /** @brief A guard that checks whether the floating point type of GPU types is either float or double */
 +    template <typename T>
 +    struct CHECK_SCALAR_TEMPLATE_ARGUMENT
 +    {
 +        typedef typename T::ERROR_SCALAR_MUST_HAVE_TEMPLATE_ARGUMENT_FLOAT_OR_DOUBLE  ResultType;
 +    };
 +    
 +    template <>
 +    struct CHECK_SCALAR_TEMPLATE_ARGUMENT<float>
 +    {
 +        typedef float  ResultType;
 +    };
 +    
 +    template <>
 +    struct CHECK_SCALAR_TEMPLATE_ARGUMENT<double>
 +    {
 +        typedef double  ResultType;
 +    };
 +
 +    
 +    
 +    /** @brief Reads text from a file into a std::string
 +    *
 +    * @param filename   The filename
 +    * @return The text read from the file
 +    */
 +    inline std::string readTextFromFile(const std::string & filename)
 +    {
 +      std::ifstream f(filename.c_str());
 +      if (!f) return std::string();
 +
 +      std::stringstream result;
 +      std::string tmp;
 +      while (std::getline(f, tmp))
 +        result << tmp << std::endl;
 +
 +      return result.str();
 +    }
 +
 +    /** @brief Replaces all occurrences of a substring with another string
 +    *
 +    * @param text   The string to search in
 +    * @param to_search  The substring to search for
 +    * @param to_replace The replacement for found substrings
 +    * @return The resulting string
 +    */
 +    inline std::string strReplace(const std::string & text, std::string to_search, std::string to_replace)
 +    {
 +      std::string::size_type pos = 0;
 +      std::string result;
 +      std::string::size_type found;
 +      while( (found = text.find(to_search, pos)) != std::string::npos )
 +      {
 +        result.append(text.substr(pos,found-pos));
 +        result.append(to_replace);
 +        pos = found + to_search.length();
 +      }
 +      if (pos < text.length())
 +        result.append(text.substr(pos));
 +      return result;
 +    }
 +
 +    /** @brief Rounds an integer to the next multiple of another integer
 +    *
 +    * @tparam INT_TYPE  The integer type
 +    * @param to_reach   The integer to be rounded up (ceil operation)
 +    * @param base       The base
 +    * @return The smallest multiple of 'base' that is greater than or equal to 'to_reach'
 +    */
 +    template <class INT_TYPE>
 +    INT_TYPE roundUpToNextMultiple(INT_TYPE to_reach, INT_TYPE base)
 +    {
 +      if (to_reach % base == 0) return to_reach;
 +      return ((to_reach / base) + 1) * base;
 +    }
 +    
 +    
 +    /** @brief Create a double precision kernel out of a single precision kernel
 +    *
 +    * @param source          The source string
 +    * @param fp_extension    An info string that specifies the OpenCL double precision extension
 +    * @return   The double precision kernel
 +    */
 +    inline std::string make_double_kernel(std::string const & source, std::string const & fp_extension)
 +    {
 +      std::stringstream ss;
 +      ss << "#pragma OPENCL EXTENSION " << fp_extension << " : enable\n\n";
 +      
 +      std::string result = ss.str();
 +      result.append(strReplace(source, "float", "double"));
 +      return result;
 +    }
 +    
 +    
 +    /** @brief Removes the const qualifier from a type */
 +    template <typename T>
 +    struct CONST_REMOVER
 +    {
 +      typedef T   ResultType;
 +    };
 +
 +    template <typename T>
 +    struct CONST_REMOVER<const T>
 +    {
 +      typedef T   ResultType;
 +    };
 +
 +
 +    /** @brief Extracts the vector type from one of the two arguments. Used for the vector_expression type.
 +    *
 +    * @tparam LHS   The left hand side operand of the vector_expression
 +    * @tparam RHS   The right hand side operand of the vector_expression
 +    */
 +    template <typename LHS, typename RHS>
 +    struct VECTOR_EXTRACTOR_IMPL
 +    {
 +      typedef typename LHS::ERROR_COULD_NOT_EXTRACT_VECTOR_INFORMATION_FROM_VECTOR_EXPRESSION  ResultType;
 +    };
 +    
 +    template <typename LHS, typename ScalarType, unsigned int A>
 +    struct VECTOR_EXTRACTOR_IMPL<LHS, viennacl::vector<ScalarType, A> >
 +    {
 +      typedef viennacl::vector<ScalarType, A>   ResultType;
 +    };
 +
 +    template <typename LHS, typename VectorType>
 +    struct VECTOR_EXTRACTOR_IMPL<LHS, viennacl::vector_range<VectorType> >
 +    {
 +      typedef VectorType   ResultType;
 +    };
 +
 +    template <typename LHS, typename VectorType>
 +    struct VECTOR_EXTRACTOR_IMPL<LHS, viennacl::vector_slice<VectorType> >
 +    {
 +      typedef VectorType   ResultType;
 +    };
 +
 +    
 +    template <typename RHS, typename ScalarType, unsigned int A>
 +    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector<ScalarType, A>, RHS>
 +    {
 +      typedef viennacl::vector<ScalarType, A>   ResultType;
 +    };
 +
 +    template <typename VectorType, typename RHS>
 +    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector_range<VectorType>, RHS>
 +    {
 +      typedef VectorType   ResultType;
 +    };
 +
 +    template <typename VectorType, typename RHS>
 +    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector_slice<VectorType>, RHS>
 +    {
 +      typedef VectorType   ResultType;
 +    };
 +    
 +    //resolve ambiguities for previous cases:
 +    template <typename ScalarType, unsigned int A>
 +    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector<ScalarType, A>, viennacl::vector<ScalarType, A> >
 +    {
 +      typedef viennacl::vector<ScalarType, A>   ResultType;
 +    };
 +
 +    template <typename VectorType>
 +    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector_range<VectorType>, viennacl::vector_range<VectorType> >
 +    {
 +      typedef VectorType   ResultType;
 +    };
 +    
 +    template <typename VectorType>
 +    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector_slice<VectorType>, viennacl::vector_slice<VectorType> >
 +    {
 +      typedef VectorType   ResultType;
 +    };
 +    
 +    
 +    template <typename LHS, typename RHS>
 +    struct VECTOR_EXTRACTOR
 +    {
 +      typedef typename VECTOR_EXTRACTOR_IMPL<typename CONST_REMOVER<LHS>::ResultType,
 +                                              typename CONST_REMOVER<RHS>::ResultType>::ResultType      ResultType;
 +    };
 +
 +    /** @brief Deduces the size of the resulting vector represented by a vector_expression from the operands
 +    *
 +    * @tparam LHS   The left hand side operand
 +    * @tparam RHS   The right hand side operand
 +    * @tparam OP    The operation tag
 +    */
 +    template <typename LHS, typename RHS, typename OP>
 +    struct VECTOR_SIZE_DEDUCER
 +    {
 +      //Take care: naively calling .size() on the left-hand-side type can cause subtle side effects!
 +    };
 +
 +    
 +    template <typename ScalarType, unsigned int A, typename RHS>
 +    struct VECTOR_SIZE_DEDUCER<const viennacl::vector<ScalarType, A>, RHS, viennacl::op_add>
 +    {
 +      static size_t size(const viennacl::vector<ScalarType, A> & lhs,
 +                         const RHS & rhs) { return lhs.size(); }
 +    };
 +
 +    template <typename ScalarType, unsigned int A, typename RHS>
 +    struct VECTOR_SIZE_DEDUCER<const viennacl::vector<ScalarType, A>, RHS, viennacl::op_sub>
 +    {
 +      static size_t size(const viennacl::vector<ScalarType, A> & lhs,
 +                         const RHS & rhs) { return lhs.size(); }
 +    };
 +    
 +    
 +   
 +    //Standard case: LHS is the vector type and carries the correct size
 +    template <typename ScalarType, unsigned int A, typename RHS>
 +    struct VECTOR_SIZE_DEDUCER<const viennacl::vector<ScalarType, A>, RHS, viennacl::op_prod>
 +    {
 +      static size_t size(const viennacl::vector<ScalarType, A> & lhs,
 +                         const RHS & rhs) { return lhs.size(); }
 +    };
 +
 +    template <typename ScalarType, unsigned int A, typename RHS>
 +    struct VECTOR_SIZE_DEDUCER<const viennacl::vector<ScalarType, A>, RHS, viennacl::op_div>
 +    {
 +      static size_t size(const viennacl::vector<ScalarType, A> & lhs,
 +                         const RHS & rhs) { return lhs.size(); }
 +    };
 +    
 +    //special case: matrix-vector product: Return the number of rows of the matrix
 +    template <typename ScalarType, typename F, unsigned int Amat, unsigned int A>
 +    struct VECTOR_SIZE_DEDUCER<const viennacl::matrix<ScalarType, F, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
 +    {
 +      static size_t size(const viennacl::matrix<ScalarType, F, Amat> & lhs,
 +                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
 +    };
 +
 +    template <typename ScalarType, unsigned int Amat, unsigned int A>
 +    struct VECTOR_SIZE_DEDUCER<const viennacl::circulant_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
 +    {
 +      static size_t size(const viennacl::circulant_matrix<ScalarType, Amat> & lhs,
 +                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
 +    };
 +    
 +    template <typename ScalarType, unsigned int Amat, unsigned int A>
 +    struct VECTOR_SIZE_DEDUCER<const viennacl::compressed_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
 +    {
 +      static size_t size(const viennacl::compressed_matrix<ScalarType, Amat> & lhs,
 +                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
 +    };
 +
 +    template <typename ScalarType, unsigned int Amat, unsigned int A>
 +    struct VECTOR_SIZE_DEDUCER<const viennacl::coordinate_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
 +    {
 +      static size_t size(const viennacl::coordinate_matrix<ScalarType, Amat> & lhs,
 +                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
 +    };
 +
 +    template <typename ScalarType, unsigned int Amat, unsigned int A>
 +    struct VECTOR_SIZE_DEDUCER<const viennacl::ell_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
 +    {
 +      static size_t size(const viennacl::ell_matrix<ScalarType, Amat> & lhs,
 +                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
 +    };
 +
 +    template <typename ScalarType, unsigned int Amat, unsigned int A>
 +    struct VECTOR_SIZE_DEDUCER<const viennacl::hyb_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
 +    {
 +      static size_t size(const viennacl::hyb_matrix<ScalarType, Amat> & lhs,
 +                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
 +    };
 +    
 +    
 +    //special case: transposed matrix-vector product: Return the number of cols(!) of the matrix
 +    template <typename ScalarType, typename F, unsigned int Amat, unsigned int A>
 +    struct VECTOR_SIZE_DEDUCER<const viennacl::matrix_expression< const viennacl::matrix<ScalarType, F, Amat>,
 +                                                                  const viennacl::matrix<ScalarType, F, Amat>,
 +                                                                  op_trans>,
 +                               const viennacl::vector<ScalarType, A>,
 +                               viennacl::op_prod>
 +    {
 +      static size_t size(const viennacl::matrix_expression< const viennacl::matrix<ScalarType, F, Amat>,
 +                                                            const viennacl::matrix<ScalarType, F, Amat>,
 +                                                            op_trans> & lhs,
 +                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.lhs().size2(); }
 +    };
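 +
 +    // Worked example (illustrative): for a 4x7 matrix A, the product trans(A) * x
 +    // yields a vector of length A.size2() == 7 (hence lhs.lhs().size2() above),
 +    // whereas the plain product A * x yields length A.size1() == 4.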
 +
 +    
 +    
 +    
 +    
 +    /** @brief Obtain the CPU scalar type from a type, including a GPU type like viennacl::scalar<T>
 +    *
 +    * @tparam T   Either a CPU scalar type or a GPU scalar type
 +    */
 +    template <typename T>
 +    struct CPU_SCALAR_TYPE_DEDUCER
 +    {
 +      //force compiler error if type cannot be deduced
 +      //typedef T       ResultType;
 +    };
 +
 +    template <>
 +    struct CPU_SCALAR_TYPE_DEDUCER< float >
 +    {
 +      typedef float       ResultType;
 +    };
 +
 +    template <>
 +    struct CPU_SCALAR_TYPE_DEDUCER< double >
 +    {
 +      typedef double       ResultType;
 +    };
 +    
 +    template <typename T>
 +    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::scalar<T> >
 +    {
 +      typedef T       ResultType;
 +    };
 +
 +    template <typename T, unsigned int A>
 +    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::vector<T, A> >
 +    {
 +      typedef T       ResultType;
 +    };
 +
 +    template <typename T, typename F, unsigned int A>
 +    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::matrix<T, F, A> >
 +    {
 +      typedef T       ResultType;
 +    };
 +
 +    
 +    template <typename T, typename F, unsigned int A>
 +    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::matrix_expression<const matrix<T, F, A>, const matrix<T, F, A>, op_trans> >
 +    {
 +      typedef T       ResultType;
 +    };
 +
 +        
 +  } //namespace tools
 +} //namespace viennacl
 +    
 +
 +#endif
++=======
+ #ifndef VIENNACL_TOOLS_TOOLS_HPP_
+ #define VIENNACL_TOOLS_TOOLS_HPP_
+ 
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file viennacl/tools/tools.hpp
+     @brief Various little tools used here and there in ViennaCL.
+ */
+ 
+ #include <string>
+ #include <fstream>
+ #include <sstream>
+ #include "viennacl/forwards.h"
+ #include "viennacl/tools/adapter.hpp"
+ 
+ #include <vector>
+ #include <map>
+ 
+ namespace viennacl
+ {
+   namespace tools
+   {
+ 
+     /** \cond */
+     /** @brief Supply suitable increment functions for the iterators: */
+     template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
+     struct MATRIX_ITERATOR_INCREMENTER<viennacl::row_iteration, viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
+     {
+       static void apply(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & /*mat*/, unsigned int & row, unsigned int & /*col*/) { ++row; }
+     };
+ 
+     template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
+     struct MATRIX_ITERATOR_INCREMENTER<viennacl::col_iteration, viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
+     {
+       static void apply(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & /*mat*/, unsigned int & /*row*/, unsigned int & col) { ++col; }
+     };
+     /** \endcond */
+ 
+ 
+     /** @brief A guard that checks whether the floating point type of GPU types is either float or double */
+     template <typename T>
+     struct CHECK_SCALAR_TEMPLATE_ARGUMENT
+     {
+         typedef typename T::ERROR_SCALAR_MUST_HAVE_TEMPLATE_ARGUMENT_FLOAT_OR_DOUBLE  ResultType;
+     };
+ 
+     /** \cond */
+     template <>
+     struct CHECK_SCALAR_TEMPLATE_ARGUMENT<float>
+     {
+         typedef float  ResultType;
+     };
+ 
+     template <>
+     struct CHECK_SCALAR_TEMPLATE_ARGUMENT<double>
+     {
+         typedef double  ResultType;
+     };
+     /** \endcond */
+ 
+ 
+ 
 +     /** @brief Reads text from a file into a std::string
+     *
+     * @param filename   The filename
+     * @return The text read from the file
+     */
+     inline std::string readTextFromFile(const std::string & filename)
+     {
+       std::ifstream f(filename.c_str());
+       if (!f) return std::string();
+ 
+       std::stringstream result;
+       std::string tmp;
+       while (std::getline(f, tmp))
+         result << tmp << std::endl;
+ 
+       return result.str();
+     }
+ 
 +     /** @brief Replaces all occurrences of a substring with another string
+     *
+     * @param text   The string to search in
+     * @param to_search  The substring to search for
+     * @param to_replace The replacement for found substrings
+     * @return The resulting string
+     */
+     inline std::string strReplace(const std::string & text, std::string to_search, std::string to_replace)
+     {
+       std::string::size_type pos = 0;
+       std::string result;
+       std::string::size_type found;
+       while( (found = text.find(to_search, pos)) != std::string::npos )
+       {
+         result.append(text.substr(pos,found-pos));
+         result.append(to_replace);
+         pos = found + to_search.length();
+       }
+       if (pos < text.length())
+         result.append(text.substr(pos));
+       return result;
+     }
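 + 
 +     // Usage sketch (illustrative values):
 +     //
 +     //   std::string s = strReplace("float x = 1.0f;", "float", "double");
 +     //   // s == "double x = 1.0f;"
 +     //
 +     // Every occurrence of the search string is replaced; make_double_kernel()
 +     // below relies on exactly this behavior.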
+ 
+     /** @brief Rounds an integer to the next multiple of another integer
+     *
+     * @tparam INT_TYPE  The integer type
+     * @param to_reach   The integer to be rounded up (ceil operation)
+     * @param base       The base
 +     * @return The smallest multiple of 'base' that is greater than or equal to 'to_reach'
+     */
+     template <class INT_TYPE>
+     INT_TYPE align_to_multiple(INT_TYPE to_reach, INT_TYPE base)
+     {
+       if (to_reach % base == 0) return to_reach;
+       return ((to_reach / base) + 1) * base;
+     }
+ 
+ 
+     /** @brief Rounds an integer to the previous multiple of another integer
+     *
+     * @tparam INT_TYPE  The integer type
+     * @param to_reach   The integer to be rounded down (floor operation)
+     * @param base       The base
 +     * @return The largest multiple of 'base' that is less than or equal to 'to_reach'
+     */
+     template <class INT_TYPE>
+     INT_TYPE roundDownToPreviousMultiple(INT_TYPE to_reach, INT_TYPE base)
+     {
+       if (to_reach % base == 0) return to_reach;
+       return (to_reach / base) * base;
+     }
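 + 
 +     // Usage sketch (illustrative values) for the two rounding helpers:
 +     //
 +     //   align_to_multiple<int>(130, 128);            // 256
 +     //   align_to_multiple<int>(128, 128);            // 128 (already a multiple)
 +     //   roundDownToPreviousMultiple<int>(130, 128);  // 128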
+ 
 +     /** @brief Replaces all occurrences of a pattern in a source string (in place)
 +      *
 +      * @param source  The source string, modified in place
 +      * @param find    The substring to search for
 +      * @param replace The replacement string
 +      * @return The number of replacements performed
 +      */
+     int inline find_and_replace(std::string & source, std::string const & find, std::string const & replace)
+     {
+         int num=0;
+         vcl_size_t fLen = find.size();
+         vcl_size_t rLen = replace.size();
+         for (vcl_size_t pos=0; (pos=source.find(find, pos))!=std::string::npos; pos+=rLen)
+         {
+             num++;
+             source.replace(pos, fLen, replace);
+         }
+         return num;
+     }
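 + 
 +     // Usage sketch (illustrative values):
 +     //
 +     //   std::string src = "a*b + a*c";
 +     //   int n = find_and_replace(src, "a*", "x*");
 +     //   // src == "x*b + x*c", n == 2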
+ 
+     /** @brief Create a double precision kernel out of a single precision kernel
+     *
+     * @param source          The source string
+     * @param fp_extension    An info string that specifies the OpenCL double precision extension
+     * @return   The double precision kernel
+     */
+     inline std::string make_double_kernel(std::string const & source, std::string const & fp_extension)
+     {
+       std::stringstream ss;
+       ss << "#pragma OPENCL EXTENSION " << fp_extension << " : enable\n\n";
+ 
+       std::string result = ss.str();
+       result.append(strReplace(source, "float", "double"));
+       return result;
+     }
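 + 
 +     // Usage sketch: 'float_src' stands for any OpenCL kernel source written for
 +     // float; cl_khr_fp64 is the standard double precision extension string:
 +     //
 +     //   std::string double_src = make_double_kernel(float_src, "cl_khr_fp64");
 +     //
 +     // Note that the plain-text substitution also rewrites identifiers that merely
 +     // contain "float" as a substring.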
+ 
+ 
+     /** @brief Removes the const qualifier from a type */
+     template <typename T>
+     struct CONST_REMOVER
+     {
+       typedef T   ResultType;
+     };
+ 
+     /** \cond */
+     template <typename T>
+     struct CONST_REMOVER<const T>
+     {
+       typedef T   ResultType;
+     };
+     /** \endcond */
+ 
+ 
+     /////// CPU scalar type deducer ///////////
+ 
 +     /** @brief Obtain the CPU scalar type from a type, including a GPU type like viennacl::scalar<T>
+     *
+     * @tparam T   Either a CPU scalar type or a GPU scalar type
+     */
+     template <typename T>
+     struct CPU_SCALAR_TYPE_DEDUCER
+     {
+       //force compiler error if type cannot be deduced
+       //typedef T       ResultType;
+     };
+ 
+     /** \cond */
+     template <>
+     struct CPU_SCALAR_TYPE_DEDUCER< float >
+     {
+       typedef float       ResultType;
+     };
+ 
+     template <>
+     struct CPU_SCALAR_TYPE_DEDUCER< double >
+     {
+       typedef double       ResultType;
+     };
+ 
+     template <typename T>
+     struct CPU_SCALAR_TYPE_DEDUCER< viennacl::scalar<T> >
+     {
+       typedef T       ResultType;
+     };
+ 
+     template <typename T, unsigned int A>
+     struct CPU_SCALAR_TYPE_DEDUCER< viennacl::vector<T, A> >
+     {
+       typedef T       ResultType;
+     };
+ 
+     template <typename T, typename F, unsigned int A>
+     struct CPU_SCALAR_TYPE_DEDUCER< viennacl::matrix<T, F, A> >
+     {
+       typedef T       ResultType;
+     };
+ 
+ 
+     template <typename T, typename F, unsigned int A>
+     struct CPU_SCALAR_TYPE_DEDUCER< viennacl::matrix_expression<const matrix<T, F, A>, const matrix<T, F, A>, op_trans> >
+     {
+       typedef T       ResultType;
+     };
+     /** \endcond */
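 + 
 +     // Usage sketch (illustrative): the deduced type is always the host scalar,
 +     // no matter whether the argument lives on the host or on the device:
 +     //
 +     //   typedef CPU_SCALAR_TYPE_DEDUCER< viennacl::scalar<double> >::ResultType  T1;  // double
 +     //   typedef CPU_SCALAR_TYPE_DEDUCER< viennacl::vector<float> >::ResultType   T2;  // float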
+ 
 +     //
 +     // Converts a host scalar to the required type when necessary, but passes viennacl::scalar<> objects through unchanged
 +     // (typical use-case: converting user-provided floats to double, and vice versa, for OpenCL kernels).
 +     //
+ 
+     template <typename HostScalarType>
+     viennacl::scalar<HostScalarType> const & promote_if_host_scalar(viennacl::scalar<HostScalarType> const & s) { return s; }
+ 
+     template <typename HostScalarType>
+     viennacl::scalar_expression<const viennacl::scalar<HostScalarType>,
+                                 const viennacl::scalar<HostScalarType>,
+                                 viennacl::op_flip_sign> const &
+     promote_if_host_scalar(viennacl::scalar_expression<const viennacl::scalar<HostScalarType>,
+                                                        const viennacl::scalar<HostScalarType>,
+                                                        viennacl::op_flip_sign> const & s) { return s; }
+ 
+     template <typename HostScalarType>
+     HostScalarType promote_if_host_scalar(float s) { return s; }
+ 
+     template <typename HostScalarType>
+     HostScalarType promote_if_host_scalar(double s) { return s; }
+ 
+     template <typename HostScalarType>
+     HostScalarType promote_if_host_scalar(long s) { return s; }
+ 
+     template <typename HostScalarType>
+     HostScalarType promote_if_host_scalar(unsigned long s) { return s; }
+ 
+     template <typename HostScalarType>
+     HostScalarType promote_if_host_scalar(int s) { return s; }
+ 
+     template <typename HostScalarType>
+     HostScalarType promote_if_host_scalar(unsigned int s) { return s; }
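 + 
 +     // Usage sketch (illustrative): with a kernel expecting double arguments, a
 +     // host-provided int is converted on the host, while a viennacl::scalar<double>
 +     // is forwarded by reference without any conversion:
 +     //
 +     //   double alpha = promote_if_host_scalar<double>(3);  // int -> double
 +     //   viennacl::scalar<double> s = 2.0;
 +     //   promote_if_host_scalar<double>(s);                  // returns s itself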
+ 
+   } //namespace tools
+ } //namespace viennacl
+ 
+ 
+ #endif
++>>>>>>> upstream/1.5.1
diff --cc viennacl/traits/handle.hpp
index c60e7c7,fa196cc..fe6a622
--- a/viennacl/traits/handle.hpp
+++ b/viennacl/traits/handle.hpp
@@@ -1,104 -1,245 +1,352 @@@
++<<<<<<< HEAD
 +#ifndef VIENNACL_TRAITS_HANDLE_HPP_
 +#define VIENNACL_TRAITS_HANDLE_HPP_
 +
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file traits/handle.hpp
 +    @brief Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc.
 +*/
 +
 +#include <string>
 +#include <fstream>
 +#include <sstream>
 +#include "viennacl/forwards.h"
 +
 +#ifdef __APPLE__
 +#include <OpenCL/cl.h>
 +#else
 +#include <CL/cl.h>
 +#endif
 +
 +namespace viennacl
 +{
 +  namespace traits
 +  {
 +    
 +    // Returns the OpenCL handle of a ViennaCL object
 +    template <typename T>
 +    viennacl::ocl::handle<cl_mem> handle(T & obj)
 +    {
 +      return obj.handle();
 +    }
 +
 +
 +    template <typename T>
 +    viennacl::ocl::handle<cl_mem> handle(viennacl::vector_range<T> & obj)
 +    {
 +      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
 +    }
 +    
 +    template <typename T>
 +    viennacl::ocl::handle<cl_mem> handle(viennacl::vector_range<T> const & obj)
 +    {
 +      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
 +    }
 +
 +
 +    template <typename T>
 +    viennacl::ocl::handle<cl_mem> handle(viennacl::vector_slice<T> & obj)
 +    {
 +      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
 +    }
 +
 +    template <typename T>
 +    viennacl::ocl::handle<cl_mem> handle(viennacl::vector_slice<T> const & obj)
 +    {
 +      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
 +    }
 +
 +
 +
 +    template <typename T>
 +    viennacl::ocl::handle<cl_mem> handle(viennacl::matrix_range<T> & obj)
 +    {
 +      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
 +    }
 +
 +    template <typename T>
 +    viennacl::ocl::handle<cl_mem> handle(viennacl::matrix_range<T> const & obj)
 +    {
 +      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
 +    }
 +
 +
 +    template <typename T>
 +    viennacl::ocl::handle<cl_mem> handle(viennacl::matrix_slice<T> & obj)
 +    {
 +      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
 +    }
 +
 +    template <typename T>
 +    viennacl::ocl::handle<cl_mem> handle(viennacl::matrix_slice<T> const & obj)
 +    {
 +      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
 +    }
 +
 +  } //namespace traits
 +} //namespace viennacl
 +    
 +
 +#endif
++=======
+ #ifndef VIENNACL_TRAITS_HANDLE_HPP_
+ #define VIENNACL_TRAITS_HANDLE_HPP_
+ 
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file viennacl/traits/handle.hpp
+     @brief Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc.
+ */
+ 
+ #include <string>
+ #include <fstream>
+ #include <sstream>
+ #include "viennacl/forwards.h"
+ 
+ #include "viennacl/backend/mem_handle.hpp"
+ 
+ namespace viennacl
+ {
+   namespace traits
+   {
+     //
+     // Generic memory handle
+     //
+     /** @brief Returns the generic memory handle of an object. Non-const version. */
+     template <typename T>
+     viennacl::backend::mem_handle & handle(T & obj)
+     {
+       return obj.handle();
+     }
+ 
+     /** @brief Returns the generic memory handle of an object. Const-version. */
+     template <typename T>
+     viennacl::backend::mem_handle const & handle(T const & obj)
+     {
+       return obj.handle();
+     }
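 + 
 +     // Usage sketch (illustrative): the same entry point works for any ViennaCL
 +     // container that exposes a .handle() member:
 +     //
 +     //   viennacl::vector<float> x(100);
 +     //   viennacl::backend::mem_handle & h = viennacl::traits::handle(x);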
+ 
+     /** \cond */
+     inline char   handle(char val)   { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline short  handle(short val)  { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline int    handle(int val)    { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline long   handle(long val)   { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline float  handle(float val)  { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline double handle(double val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+ 
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::backend::mem_handle       & handle(viennacl::scalar_expression< const LHS, const RHS, OP> & obj)
+     {
+       return handle(obj.lhs());
+     }
+ 
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::backend::mem_handle const & handle(viennacl::matrix_expression<LHS, RHS, OP> const & obj);
+ 
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::backend::mem_handle const & handle(viennacl::vector_expression<LHS, RHS, OP> const & obj);
+ 
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::backend::mem_handle const & handle(viennacl::scalar_expression< const LHS, const RHS, OP> const & obj)
+     {
+       return handle(obj.lhs());
+     }
+ 
+     // proxy objects require extra care (at the moment)
+     template <typename T>
+     viennacl::backend::mem_handle       & handle(viennacl::vector_base<T>       & obj)
+     {
+       return obj.handle();
+     }
+ 
+     template <typename T>
+     viennacl::backend::mem_handle const & handle(viennacl::vector_base<T> const & obj)
+     {
+       return obj.handle();
+     }
+ 
+ 
+ 
+     template <typename T>
+     viennacl::backend::mem_handle       & handle(viennacl::matrix_range<T>       & obj)
+     {
+       return obj.get().handle();
+     }
+ 
+     template <typename T>
+     viennacl::backend::mem_handle const & handle(viennacl::matrix_range<T> const & obj)
+     {
+       return obj.get().handle();
+     }
+ 
+ 
+     template <typename T>
+     viennacl::backend::mem_handle       & handle(viennacl::matrix_slice<T>      & obj)
+     {
+       return obj.get().handle();
+     }
+ 
+     template <typename T>
+     viennacl::backend::mem_handle const & handle(viennacl::matrix_slice<T> const & obj)
+     {
+       return obj.get().handle();
+     }
+ 
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::backend::mem_handle const & handle(viennacl::vector_expression<LHS, RHS, OP> const & obj)
+     {
+       return handle(obj.lhs());
+     }
+ 
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::backend::mem_handle const & handle(viennacl::matrix_expression<LHS, RHS, OP> const & obj)
+     {
+       return handle(obj.lhs());
+     }
+ 
+     /** \endcond */
+ 
+     //
+     // RAM handle extraction
+     //
+     /** @brief Generic helper routine for extracting the RAM handle of a ViennaCL object. Non-const version. */
+     template <typename T>
+     typename viennacl::backend::mem_handle::ram_handle_type & ram_handle(T & obj)
+     {
+       return viennacl::traits::handle(obj).ram_handle();
+     }
+ 
+     /** @brief Generic helper routine for extracting the RAM handle of a ViennaCL object. Const version. */
+     template <typename T>
+     typename viennacl::backend::mem_handle::ram_handle_type const & ram_handle(T const & obj)
+     {
+       return viennacl::traits::handle(obj).ram_handle();
+     }
+ 
+     /** \cond */
+     inline viennacl::backend::mem_handle::ram_handle_type & ram_handle(viennacl::backend::mem_handle & h)
+     {
+       return h.ram_handle();
+     }
+ 
+     inline viennacl::backend::mem_handle::ram_handle_type const & ram_handle(viennacl::backend::mem_handle const & h)
+     {
+       return h.ram_handle();
+     }
+     /** \endcond */
+ 
+     //
+     // OpenCL handle extraction
+     //
+ #ifdef VIENNACL_WITH_OPENCL
+     /** @brief Generic helper routine for extracting the OpenCL handle of a ViennaCL object. Non-const version. */
+     template <typename T>
+     viennacl::ocl::handle<cl_mem> & opencl_handle(T & obj)
+     {
+       return viennacl::traits::handle(obj).opencl_handle();
+     }
+ 
+     /** @brief Generic helper routine for extracting the OpenCL handle of a ViennaCL object. Const version. */
+     template <typename T>
+     viennacl::ocl::handle<cl_mem> const & opencl_handle(T const & obj)
+     {
+       return viennacl::traits::handle(obj).opencl_handle();
+     }
+ 
+     inline cl_char   opencl_handle(char            val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline cl_short  opencl_handle(short           val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline cl_int    opencl_handle(int             val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline cl_long   opencl_handle(long            val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline cl_uchar  opencl_handle(unsigned char   val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline cl_ushort opencl_handle(unsigned short  val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline cl_uint   opencl_handle(unsigned int    val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline cl_ulong  opencl_handle(unsigned long   val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline float     opencl_handle(float           val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+     inline double    opencl_handle(double          val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
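 + 
 +     // Usage sketch (illustrative; requires VIENNACL_WITH_OPENCL): obtaining the
 +     // raw cl_mem buffer of a ViennaCL vector, e.g. for a hand-written kernel:
 +     //
 +     //   viennacl::vector<float> x(100);
 +     //   cl_mem buf = viennacl::traits::opencl_handle(x).get();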
+ 
+ 
+ #endif
+ 
+ 
+ 
+     //
+     // Active handle ID
+     //
+     /** @brief Returns an ID for the currently active memory domain of an object */
+     template <typename T>
+     viennacl::memory_types active_handle_id(T const & obj)
+     {
+       return handle(obj).get_active_handle_id();
+     }
+ 
+     /** \cond */
+     template <typename T>
+     viennacl::memory_types active_handle_id(circulant_matrix<T> const &) { return OPENCL_MEMORY; }
+ 
+     template <typename T>
+     viennacl::memory_types active_handle_id(hankel_matrix<T> const &) { return OPENCL_MEMORY; }
+ 
+     template <typename T>
+     viennacl::memory_types active_handle_id(toeplitz_matrix<T> const &) { return OPENCL_MEMORY; }
+ 
+     template <typename T>
+     viennacl::memory_types active_handle_id(vandermonde_matrix<T> const &) { return OPENCL_MEMORY; }
+ 
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::memory_types active_handle_id(viennacl::vector_expression<LHS, RHS, OP> const &);
+ 
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::memory_types active_handle_id(viennacl::scalar_expression<LHS, RHS, OP> const & obj)
+     {
+       return active_handle_id(obj.lhs());
+     }
+ 
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::memory_types active_handle_id(viennacl::vector_expression<LHS, RHS, OP> const & obj)
+     {
+       return active_handle_id(obj.lhs());
+     }
+ 
+     template <typename LHS, typename RHS, typename OP>
+     viennacl::memory_types active_handle_id(viennacl::matrix_expression<LHS, RHS, OP> const & obj)
+     {
+       return active_handle_id(obj.lhs());
+     }
+     /** \endcond */
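 + 
 +     // Usage sketch (illustrative): dispatching on the currently active memory
 +     // domain of an object:
 +     //
 +     //   viennacl::vector<float> x(100);
 +     //   if (viennacl::traits::active_handle_id(x) == viennacl::OPENCL_MEMORY)
 +     //   { /* OpenCL-specific code path */ }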
+ 
+   } //namespace traits
+ } //namespace viennacl
+ 
+ 
+ #endif
++>>>>>>> upstream/1.5.1
diff --cc viennacl/traits/size.hpp
index 665c835,4c8bd08..f95ba78
--- a/viennacl/traits/size.hpp
+++ b/viennacl/traits/size.hpp
@@@ -1,241 -1,320 +1,564 @@@
++<<<<<<< HEAD
 +#ifndef VIENNACL_TRAITS_SIZE_HPP_
 +#define VIENNACL_TRAITS_SIZE_HPP_
 +
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file size.hpp
 +    @brief Generic size and resize functionality for different vector and matrix types
 +*/
 +
 +#include <string>
 +#include <fstream>
 +#include <sstream>
 +#include "viennacl/forwards.h"
 +#include "viennacl/meta/result_of.hpp"
 +
 +#ifdef VIENNACL_HAVE_UBLAS  
 +#include <boost/numeric/ublas/matrix_sparse.hpp>
 +#include <boost/numeric/ublas/matrix.hpp>
 +#endif
 +
 +#ifdef VIENNACL_HAVE_EIGEN  
 +#include <Eigen/Core>
 +#include <Eigen/Sparse>
 +#endif
 +
 +#ifdef VIENNACL_HAVE_MTL4
 +#include <boost/numeric/mtl/mtl.hpp>
 +#endif
 +
 +#include <vector>
 +#include <map>
 +
 +namespace viennacl
 +{
 +
 +  namespace traits
 +  {
 +    //
 +    // Resize: Change the size of vectors and matrices
 +    //
 +    template <typename MatrixType>
 +    void resize(MatrixType & matrix, size_t rows, size_t cols)
 +    {
 +      matrix.resize(rows, cols); 
 +    }
 +    
 +    template <typename VectorType>
 +    void resize(VectorType & vec, size_t new_size)
 +    {
 +      vec.resize(new_size); 
 +    }
 +    
 +    #ifdef VIENNACL_HAVE_UBLAS  
 +    //ublas needs separate treatment:
 +    template <typename ScalarType>
 +    void resize(boost::numeric::ublas::compressed_matrix<ScalarType> & matrix,
 +                size_t rows,
 +                size_t cols)
 +    {
 +      matrix.resize(rows, cols, false); //Note: omitting third parameter leads to compile time error (not implemented in ublas <= 1.42) 
 +    }
 +    #endif  
 +    
 +    
 +    #ifdef VIENNACL_HAVE_MTL4
 +    template <typename ScalarType>
 +    void resize(mtl::compressed2D<ScalarType> & matrix,
 +                size_t rows,
 +                size_t cols)
 +    {
 +      matrix.change_dim(rows, cols);
 +    }
 +    
 +    template <typename ScalarType>
 +    void resize(mtl::dense_vector<ScalarType> & vec,
 +                size_t new_size)
 +    {
 +      vec.change_dim(new_size);
 +    }
 +    #endif      
 +
 +    #ifdef VIENNACL_HAVE_EIGEN
 +    inline void resize(Eigen::MatrixXf & m,
 +                       std::size_t new_rows,
 +                       std::size_t new_cols)
 +    {
 +      m.resize(new_rows, new_cols);
 +    }
 +    
 +    inline void resize(Eigen::MatrixXd & m,
 +                       std::size_t new_rows,
 +                       std::size_t new_cols)
 +    {
 +      m.resize(new_rows, new_cols);
 +    }
 +    
 +    template <typename T, int options>
 +    inline void resize(Eigen::SparseMatrix<T, options> & m,
 +                       std::size_t new_rows,
 +                       std::size_t new_cols)
 +    {
 +      m.resize(new_rows, new_cols);
 +    }    
 +    
 +    inline void resize(Eigen::VectorXf & v,
 +                       std::size_t new_size)
 +    {
 +      v.resize(new_size);
 +    }
 +    
 +    inline void resize(Eigen::VectorXd & v,
 +                       std::size_t new_size)
 +    {
 +      v.resize(new_size);
 +    }
 +    #endif
 +
 +
 +    //
 +    // size: Returns the length of vectors
 +    //
 +    template <typename VectorType>
 +    typename result_of::size_type<VectorType>::type size(VectorType const & vec)
 +    {
 +      return vec.size(); 
 +    }
 +
 +    #ifdef VIENNACL_HAVE_MTL4
 +    template <typename ScalarType>
 +    typename result_of::size_type< mtl::dense_vector<ScalarType> >::type
 +    size(mtl::dense_vector<ScalarType> const & vec) { return vec.used_memory(); }
 +    #endif
 +    
 +    #ifdef VIENNACL_HAVE_EIGEN
 +    inline std::size_t size(Eigen::VectorXf const & v) { return v.rows(); }
 +    inline std::size_t size(Eigen::VectorXd const & v) { return v.rows(); }
 +    #endif
 +
 +    //
 +    // size1: No. of rows for matrices
 +    //
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    size1(MatrixType const & mat) { return mat.size1(); }
 +
 +    #ifdef VIENNACL_HAVE_EIGEN
 +    inline std::size_t size1(Eigen::MatrixXf const & m) { return m.rows(); }
 +    inline std::size_t size1(Eigen::MatrixXd const & m) { return m.rows(); }
 +    template <typename T, int options>
 +    inline std::size_t size1(Eigen::SparseMatrix<T, options> & m) { return m.rows(); }    
 +    #endif
 +
 +    //
 +    // size2: No. of columns for matrices
 +    //
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    size2(MatrixType const & mat) { return mat.size2(); }
 + 
 +    #ifdef VIENNACL_HAVE_EIGEN
 +    inline std::size_t size2(Eigen::MatrixXf const & m) { return m.cols(); }
 +    inline std::size_t size2(Eigen::MatrixXd const & m) { return m.cols(); }
 +    template <typename T, int options>
 +    inline std::size_t size2(Eigen::SparseMatrix<T, options> & m) { return m.cols(); }    
 +    #endif
 + 
 +    //
 +    // internal_size: Returns the internal (padded) length of vectors
 +    //
 +    template <typename VectorType>
 +    typename result_of::size_type<VectorType>::type 
 +    internal_size(VectorType const & vec)
 +    {
 +      return vec.internal_size(); 
 +    }
 +
 +    template <typename VectorType>
 +    typename result_of::size_type<VectorType>::type 
 +    internal_size(viennacl::vector_range<VectorType> const & vec)
 +    {
 +      return vec.get().internal_size(); 
 +    }
 +    
 +    template <typename VectorType>
 +    typename result_of::size_type<VectorType>::type 
 +    internal_size(viennacl::vector_slice<VectorType> const & vec)
 +    {
 +      return vec.get().internal_size(); 
 +    }
 +
 +
 +    //
 +    // internal_size1: No. of internal (padded) rows for matrices
 +    //
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    internal_size1(MatrixType const & mat) { return mat.internal_size1(); }
 +
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    internal_size1(viennacl::matrix_range<MatrixType> const & mat) { return mat.get().internal_size1(); }
 +
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    internal_size1(viennacl::matrix_slice<MatrixType> const & mat) { return mat.get().internal_size1(); }
 +
 +
 +    //
 +    // internal_size2: No. of internal (padded) columns for matrices
 +    //
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    internal_size2(MatrixType const & mat) { return mat.internal_size2(); }
 + 
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    internal_size2(viennacl::matrix_range<MatrixType> const & mat) { return mat.get().internal_size2(); }
 +
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    internal_size2(viennacl::matrix_slice<MatrixType> const & mat) { return mat.get().internal_size2(); }
 + 
 +  } //namespace traits
 +} //namespace viennacl
 +    
 +
 +#endif
++=======
+ #ifndef VIENNACL_TRAITS_SIZE_HPP_
+ #define VIENNACL_TRAITS_SIZE_HPP_
+ 
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file viennacl/traits/size.hpp
+     @brief Generic size and resize functionality for different vector and matrix types
+ */
+ 
+ #include <string>
+ #include <fstream>
+ #include <sstream>
+ #include "viennacl/forwards.h"
+ #include "viennacl/meta/result_of.hpp"
+ #include "viennacl/meta/predicate.hpp"
+ 
+ #ifdef VIENNACL_WITH_UBLAS
+ #include <boost/numeric/ublas/matrix_sparse.hpp>
+ #include <boost/numeric/ublas/matrix.hpp>
+ #endif
+ 
+ #ifdef VIENNACL_WITH_EIGEN
+ #include <Eigen/Core>
+ #include <Eigen/Sparse>
+ #endif
+ 
+ #ifdef VIENNACL_WITH_MTL4
+ #include <boost/numeric/mtl/mtl.hpp>
+ #endif
+ 
+ #include <vector>
+ #include <map>
+ 
+ namespace viennacl
+ {
+ 
+   namespace traits
+   {
+     //
+     // Resize: Change the size of vectors and matrices
+     //
+     /** @brief Generic resize routine for resizing a matrix (ViennaCL, uBLAS, etc.) to a new size/dimension */
+     template <typename MatrixType>
+     void resize(MatrixType & matrix, vcl_size_t rows, vcl_size_t cols)
+     {
+       matrix.resize(rows, cols);
+     }
+ 
+     /** @brief Generic resize routine for resizing a vector (ViennaCL, uBLAS, etc.) to a new size */
+     template <typename VectorType>
+     void resize(VectorType & vec, vcl_size_t new_size)
+     {
+       vec.resize(new_size);
+     }
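 + 
 +     // Usage sketch (illustrative): the same free functions resize ViennaCL,
 +     // uBLAS or Eigen objects alike:
 +     //
 +     //   viennacl::matrix<float> A(4, 4);
 +     //   viennacl::traits::resize(A, 8, 8);  // grows A to 8x8
 +     //
 +     //   viennacl::vector<float> x(10);
 +     //   viennacl::traits::resize(x, 20);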
+ 
+     /** \cond */
+     #ifdef VIENNACL_WITH_UBLAS
+     //ublas needs separate treatment:
+     template <typename ScalarType>
+     void resize(boost::numeric::ublas::compressed_matrix<ScalarType> & matrix,
+                 vcl_size_t rows,
+                 vcl_size_t cols)
+     {
+       matrix.resize(rows, cols, false); //Note: omitting third parameter leads to compile time error (not implemented in ublas <= 1.42)
+     }
+     #endif
+ 
+ 
+     #ifdef VIENNACL_WITH_MTL4
+     template <typename ScalarType>
+     void resize(mtl::compressed2D<ScalarType> & matrix,
+                 vcl_size_t rows,
+                 vcl_size_t cols)
+     {
+       matrix.change_dim(rows, cols);
+     }
+ 
+     template <typename ScalarType>
+     void resize(mtl::dense_vector<ScalarType> & vec,
+                 vcl_size_t new_size)
+     {
+       vec.change_dim(new_size);
+     }
+     #endif
+ 
+     #ifdef VIENNACL_WITH_EIGEN
+     inline void resize(Eigen::MatrixXf & m,
+                        vcl_size_t new_rows,
+                        vcl_size_t new_cols)
+     {
+       m.resize(new_rows, new_cols);
+     }
+ 
+     inline void resize(Eigen::MatrixXd & m,
+                        vcl_size_t new_rows,
+                        vcl_size_t new_cols)
+     {
+       m.resize(new_rows, new_cols);
+     }
+ 
+     template <typename T, int options>
+     inline void resize(Eigen::SparseMatrix<T, options> & m,
+                        vcl_size_t new_rows,
+                        vcl_size_t new_cols)
+     {
+       m.resize(new_rows, new_cols);
+     }
+ 
+     inline void resize(Eigen::VectorXf & v,
+                        vcl_size_t new_size)
+     {
+       v.resize(new_size);
+     }
+ 
+     inline void resize(Eigen::VectorXd & v,
+                        vcl_size_t new_size)
+     {
+       v.resize(new_size);
+     }
+     #endif
+     /** \endcond */
+ 
+ 
+     //
+     // size: Returns the length of vectors
+     //
+     /** @brief Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.) */
+     template <typename VectorType>
+     vcl_size_t size(VectorType const & vec)
+     {
+       return vec.size();
+     }
+ 
+     /** \cond */
+     template <typename SparseMatrixType, typename VectorType>
+     typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                   vcl_size_t >::type
+     size(vector_expression<const SparseMatrixType, const VectorType, op_prod> const & proxy)
+     {
+       return proxy.lhs().size1();
+     }
+ 
+     template <typename T, unsigned int A, typename VectorType>
+     vcl_size_t size(vector_expression<const circulant_matrix<T, A>, const VectorType, op_prod> const & proxy) { return proxy.lhs().size1();  }
+ 
+     template <typename T, unsigned int A, typename VectorType>
+     vcl_size_t size(vector_expression<const hankel_matrix<T, A>, const VectorType, op_prod> const & proxy) { return proxy.lhs().size1();  }
+ 
+     template <typename T, unsigned int A, typename VectorType>
+     vcl_size_t size(vector_expression<const toeplitz_matrix<T, A>, const VectorType, op_prod> const & proxy) { return proxy.lhs().size1();  }
+ 
+     template <typename T, unsigned int A, typename VectorType>
+     vcl_size_t size(vector_expression<const vandermonde_matrix<T, A>, const VectorType, op_prod> const & proxy) { return proxy.lhs().size1();  }
+ 
+     template <typename NumericT, typename F>
+     vcl_size_t size(vector_expression<const matrix_base<NumericT, F>, const vector_base<NumericT>, op_prod> const & proxy)  //matrix-vector product
+     {
+       return proxy.lhs().size1();
+     }
+ 
+     template <typename NumericT, typename F>
+     vcl_size_t size(vector_expression<const matrix_expression<const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans>,
+                                       const vector_base<NumericT>,
+                                       op_prod> const & proxy)  //transposed matrix-vector product
+     {
+       return proxy.lhs().lhs().size2();
+     }
+ 
+ 
+     #ifdef VIENNACL_WITH_MTL4
+     template <typename ScalarType>
+     vcl_size_t size(mtl::dense_vector<ScalarType> const & vec) { return vec.used_memory(); }
+     #endif
+ 
+     #ifdef VIENNACL_WITH_EIGEN
+     inline vcl_size_t size(Eigen::VectorXf const & v) { return v.rows(); }
+     inline vcl_size_t size(Eigen::VectorXd const & v) { return v.rows(); }
+     #endif
+ 
+     template <typename LHS, typename RHS, typename OP>
+     vcl_size_t size(vector_expression<LHS, RHS, OP> const & proxy)
+     {
+       return size(proxy.lhs());
+     }
+ 
+     template <typename LHS, typename RHS>
+     vcl_size_t size(vector_expression<LHS, const vector_tuple<RHS>, op_inner_prod> const & proxy)
+     {
+       return proxy.rhs().const_size();
+     }
+ 
+     /** \endcond */
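 + 
 +     // Usage sketch (illustrative): the size of an expression is deduced without
 +     // evaluating it, e.g. for (transposed) matrix-vector products:
 +     //
 +     //   viennacl::matrix<float> A(4, 7);
 +     //   viennacl::vector<float> x(7), y(4);
 +     //   viennacl::traits::size(viennacl::linalg::prod(A, x));                  // 4
 +     //   viennacl::traits::size(viennacl::linalg::prod(viennacl::trans(A), y)); // 7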
+ 
+ 
+     //
+     // size1: No. of rows for matrices
+     //
+     /** @brief Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.) */
+     template <typename MatrixType>
+     vcl_size_t
+     size1(MatrixType const & mat) { return mat.size1(); }
+ 
+     /** \cond */
+     template <typename RowType>
+     vcl_size_t
+     size1(std::vector< RowType > const & mat) { return mat.size(); }
+ 
+     #ifdef VIENNACL_WITH_EIGEN
+     inline vcl_size_t size1(Eigen::MatrixXf const & m) { return static_cast<vcl_size_t>(m.rows()); }
+     inline vcl_size_t size1(Eigen::MatrixXd const & m) { return static_cast<vcl_size_t>(m.rows()); }
+     template <typename T, int options>
+     inline vcl_size_t size1(Eigen::SparseMatrix<T, options> & m) { return static_cast<vcl_size_t>(m.rows()); }
+     #endif
+ 
+ #ifdef VIENNACL_WITH_MTL4
+     template <typename SCALARTYPE, typename T>
+     vcl_size_t size1(mtl::dense2D<SCALARTYPE, T> const & m) { return static_cast<vcl_size_t>(m.num_rows()); }
+     template <typename SCALARTYPE>
+     vcl_size_t size1(mtl::compressed2D<SCALARTYPE> const & m) { return static_cast<vcl_size_t>(m.num_rows()); }
+ #endif
+ 
+     /** \endcond */
+ 
+     //
+     // size2: No. of columns for matrices
+     //
+     /** @brief Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.) */
+     template <typename MatrixType>
+     typename result_of::size_type<MatrixType>::type
+     size2(MatrixType const & mat) { return mat.size2(); }
+ 
+     /** \cond */
+     #ifdef VIENNACL_WITH_EIGEN
+     inline vcl_size_t size2(Eigen::MatrixXf const & m) { return m.cols(); }
+     inline vcl_size_t size2(Eigen::MatrixXd const & m) { return m.cols(); }
+     template <typename T, int options>
+     inline vcl_size_t size2(Eigen::SparseMatrix<T, options> & m) { return m.cols(); }
+     #endif
+ 
+ #ifdef VIENNACL_WITH_MTL4
+     template <typename SCALARTYPE, typename T>
+     vcl_size_t size2(mtl::dense2D<SCALARTYPE, T> const & m) { return static_cast<vcl_size_t>(m.num_cols()); }
+     template <typename SCALARTYPE>
+     vcl_size_t size2(mtl::compressed2D<SCALARTYPE> const & m) { return static_cast<vcl_size_t>(m.num_cols()); }
+ #endif
+     /** \endcond */
+ 
+     //
+     // internal_size: Returns the internal (padded) length of vectors
+     //
+     /** @brief Helper routine for obtaining the buffer length of a ViennaCL vector  */
+     template <typename NumericT>
+     vcl_size_t internal_size(vector_base<NumericT> const & vec)
+     {
+       return vec.internal_size();
+     }
+ 
+ 
+     //
+     // internal_size1: No. of internal (padded) rows for matrices
+     //
 +     /** @brief Helper routine for obtaining the internal (padded) number of rows of a ViennaCL matrix  */
+     template <typename NumericT, typename F>
+     vcl_size_t internal_size1(matrix_base<NumericT, F> const & mat) { return mat.internal_size1(); }
+ 
+ 
+     //
+     // internal_size2: No. of internal (padded) columns for matrices
+     //
 +     /** @brief Helper routine for obtaining the internal (padded) number of columns of a ViennaCL matrix  */
+     template <typename NumericT, typename F>
+     vcl_size_t internal_size2(matrix_base<NumericT, F> const & mat) { return mat.internal_size2(); }
+ 
+ 
+     template <typename LHS>
+     vcl_size_t size(vector_expression<LHS, const int, op_matrix_diag> const & proxy)
+     {
+       int k = proxy.rhs();
+       int A_size1 = static_cast<int>(size1(proxy.lhs()));
+       int A_size2 = static_cast<int>(size2(proxy.lhs()));
+ 
+       int row_depth = std::min(A_size1, A_size1 + k);
+       int col_depth = std::min(A_size2, A_size2 - k);
+ 
+       return std::min(row_depth, col_depth);
+     }
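 + 
 +     // Worked example (illustrative): for a 5x3 matrix and k = -1 (first
 +     // subdiagonal), row_depth = min(5, 5 + (-1)) = 4 and col_depth = min(3, 3 - (-1)) = 3,
 +     // so the extracted diagonal has length min(4, 3) = 3.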
+ 
+     template <typename LHS>
+     vcl_size_t size(vector_expression<LHS, const unsigned int, op_row> const & proxy)
+     {
+       return size2(proxy.lhs());
+     }
+ 
+     template <typename LHS>
+     vcl_size_t size(vector_expression<LHS, const unsigned int, op_column> const & proxy)
+     {
+       return size1(proxy.lhs());
+     }
+ 
+ 
+   } //namespace traits
+ } //namespace viennacl
+ 
+ 
+ #endif
++>>>>>>> upstream/1.5.1
diff --cc viennacl/traits/start.hpp
index 0d70b8f,168f596..623ab4e
--- a/viennacl/traits/start.hpp
+++ b/viennacl/traits/start.hpp
@@@ -1,98 -1,101 +1,202 @@@
++<<<<<<< HEAD
 +#ifndef VIENNACL_TRAITS_START_HPP_
 +#define VIENNACL_TRAITS_START_HPP_
 +
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file start.hpp
 +    @brief Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc.
 +*/
 +
 +#include <string>
 +#include <fstream>
 +#include <sstream>
 +#include "viennacl/forwards.h"
 +
 +namespace viennacl
 +{
 +  namespace traits
 +  {
 +    //
 +    // start: Mostly for vectors
 +    //
 +    
 +    // Default: Try to get the start index from the .start() member function
 +    template <typename T>
 +    typename result_of::size_type<T>::type
 +    start(T const & obj)
 +    {
 +      return obj.start();
 +    }
 +    
 +    //ViennaCL vector leads to start index 0:
 +    template <typename ScalarType, unsigned int ALIGNMENT>
 +    typename result_of::size_type<viennacl::vector<ScalarType, ALIGNMENT> >::type
 +    start(viennacl::vector<ScalarType, ALIGNMENT> const & v)
 +    {
 +      return 0; 
 +    }
 +
 +
 +    //
 +    // start1: Row start index
 +    //
 +    
 +    // Default: Try to get the start index from the .start1() member function
 +    template <typename T>
 +    typename result_of::size_type<T>::type
 +    start1(T const & obj)
 +    {
 +      return obj.start1();
 +    }
 +
 +    //ViennaCL matrix leads to start index 0:
 +    template <typename ScalarType, typename F, unsigned int ALIGNMENT>
 +    typename result_of::size_type<viennacl::matrix<ScalarType, F, ALIGNMENT> >::type
 +    start1(viennacl::matrix<ScalarType, F, ALIGNMENT> const & v)
 +    {
 +      return 0; 
 +    }
 +
 +
 +    //
 +    // start2: Column start index
 +    //
 +    template <typename T>
 +    typename result_of::size_type<T>::type
 +    start2(T const & obj)
 +    {
 +      return obj.start2();
 +    }
 +
 +    //ViennaCL matrix leads to start index 0:
 +    template <typename ScalarType, typename F, unsigned int ALIGNMENT>
 +    typename result_of::size_type<viennacl::matrix<ScalarType, F, ALIGNMENT> >::type
 +    start2(viennacl::matrix<ScalarType, F, ALIGNMENT> const & v)
 +    {
 +      return 0; 
 +    }
 +    
 +
 +  } //namespace traits
 +} //namespace viennacl
 +    
 +
 +#endif
++=======
+ #ifndef VIENNACL_TRAITS_START_HPP_
+ #define VIENNACL_TRAITS_START_HPP_
+ 
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file viennacl/traits/start.hpp
+     @brief Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc.
+ */
+ 
+ #include <string>
+ #include <fstream>
+ #include <sstream>
+ #include "viennacl/forwards.h"
+ 
+ #include "viennacl/meta/result_of.hpp"
+ 
+ namespace viennacl
+ {
+   namespace traits
+   {
+     //
+     // start: Mostly for vectors
+     //
+ 
+     // Default: Try to get the start index from the .start() member function
+     template <typename T>
+     typename result_of::size_type<T>::type
+     start(T const & obj)
+     {
+       return obj.start();
+     }
+ 
+     //ViennaCL vector leads to start index 0:
+     template <typename ScalarType, unsigned int ALIGNMENT>
+     typename result_of::size_type<viennacl::vector<ScalarType, ALIGNMENT> >::type
+     start(viennacl::vector<ScalarType, ALIGNMENT> const &)
+     {
+       return 0;
+     }
+ 
+ 
+     //
+     // start1: Row start index
+     //
+ 
+     // Default: Try to get the start index from the .start1() member function
+     template <typename T>
+     typename result_of::size_type<T>::type
+     start1(T const & obj)
+     {
+       return obj.start1();
+     }
+ 
+     //ViennaCL matrix leads to start index 0:
+     template <typename ScalarType, typename F, unsigned int ALIGNMENT>
+     typename result_of::size_type<viennacl::matrix<ScalarType, F, ALIGNMENT> >::type
+     start1(viennacl::matrix<ScalarType, F, ALIGNMENT> const &)
+     {
+       return 0;
+     }
+ 
+ 
+     //
+     // start2: Column start index
+     //
+     template <typename T>
+     typename result_of::size_type<T>::type
+     start2(T const & obj)
+     {
+       return obj.start2();
+     }
+ 
+     //ViennaCL matrix leads to start index 0:
+     template <typename ScalarType, typename F, unsigned int ALIGNMENT>
+     typename result_of::size_type<viennacl::matrix<ScalarType, F, ALIGNMENT> >::type
+     start2(viennacl::matrix<ScalarType, F, ALIGNMENT> const &)
+     {
+       return 0;
+     }
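 + 
 +     // Usage sketch (illustrative): a full matrix starts at offset (0, 0); proxy
 +     // objects report their own offsets via .start1()/.start2():
 +     //
 +     //   viennacl::matrix<float> A(8, 8);
 +     //   viennacl::traits::start1(A);  // 0
 +     //   viennacl::traits::start2(A);  // 0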
+ 
+ 
+   } //namespace traits
+ } //namespace viennacl
+ 
+ 
+ #endif
++>>>>>>> upstream/1.5.1
diff --cc viennacl/traits/stride.hpp
index 43dab07,1b37507..42fb4e6
--- a/viennacl/traits/stride.hpp
+++ b/viennacl/traits/stride.hpp
@@@ -1,78 -1,75 +1,156 @@@
++<<<<<<< HEAD
 +#ifndef VIENNACL_TRAITS_INC_HPP_
 +#define VIENNACL_TRAITS_INC_HPP_
 +
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file stride.hpp
 +    @brief Determines row and column increments for matrices and matrix proxies
 +*/
 +
 +#include <string>
 +#include <fstream>
 +#include <sstream>
 +#include "viennacl/forwards.h"
 +#include "viennacl/meta/result_of.hpp"
 +
 +
 +#include <vector>
 +#include <map>
 +
 +namespace viennacl
 +{
 +
 +  namespace traits
 +  {
 +
 +    //
 +    // inc: Increment for vectors. Defaults to 1
 +    //
 +    template <typename VectorType>
 +    typename result_of::size_type<VectorType>::type
 +    stride(VectorType const & vec) { return 1; }
 +
 +    template <typename VectorType>
 +    typename result_of::size_type<VectorType>::type
 +    stride(viennacl::vector_slice<VectorType> const & s) { return s.stride(); }
 +
 +    //
 +    // inc1: Row increment for matrices. Defaults to 1
 +    //
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    stride1(MatrixType const & mat) { return 1; }
 +
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    stride1(matrix_slice<MatrixType> const & s) { return s.stride1(); }
 +
 +    //
 +    // inc2: Column increment for matrices. Defaults to 1
 +    //
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    stride2(MatrixType const & mat) { return 1; }
 + 
 +    template <typename MatrixType>
 +    typename result_of::size_type<MatrixType>::type
 +    stride2(matrix_slice<MatrixType> const & s) { return s.stride2(); }
 +
 + 
 +  } //namespace traits
 +} //namespace viennacl
 +    
 +
 +#endif
++=======
+ #ifndef VIENNACL_TRAITS_STRIDE_HPP_
+ #define VIENNACL_TRAITS_STRIDE_HPP_
+ 
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file viennacl/traits/stride.hpp
+     @brief Determines row and column increments for matrices and matrix proxies
+ */
+ 
+ #include <string>
+ #include <fstream>
+ #include <sstream>
+ #include "viennacl/forwards.h"
+ #include "viennacl/meta/result_of.hpp"
+ 
+ 
+ #include <vector>
+ #include <map>
+ 
+ namespace viennacl
+ {
+ 
+   namespace traits
+   {
+ 
+     //
+     // inc: Increment for vectors. Defaults to 1
+     //
+     template <typename T>
+     typename result_of::size_type< viennacl::vector_base<T> >::type
+     stride(viennacl::vector_base<T> const & s) { return s.stride(); }
+ 
+     //
+     // inc1: Row increment for matrices. Defaults to 1
+     //
+     //template <typename MatrixType>
+     //typename result_of::size_type<MatrixType>::type
+     //stride1(MatrixType const &) { return 1; }
+ 
+     template <typename NumericT, typename F>
+     typename result_of::size_type< matrix_base<NumericT, F> >::type
+     stride1(matrix_base<NumericT, F> const & s) { return s.stride1(); }
+ 
+     //
+     // inc2: Column increment for matrices. Defaults to 1
+     //
+     //template <typename MatrixType>
+     //typename result_of::size_type<MatrixType>::type
+     //stride2(MatrixType const &) { return 1; }
+ 
+     template <typename NumericT, typename F>
+     typename result_of::size_type< matrix_base<NumericT, F> >::type
+     stride2(matrix_base<NumericT, F> const & s) { return s.stride2(); }
+ 
+ 
+   } //namespace traits
+ } //namespace viennacl
+ 
+ 
+ #endif
++>>>>>>> upstream/1.5.1
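The stride traits above serve the same purpose for increments: the HEAD side
defaults to an increment of 1, while slice proxies report their own stride,
so generic algorithms can index any operand through one code path. A minimal
standalone analogue (illustrative only, not ViennaCL code):

    #include <cstddef>
    #include <iostream>

    struct dense_vec { };                                            // contiguous storage
    struct vec_slice { std::size_t stride() const { return 3; } };   // every third entry

    std::size_t stride(dense_vec const &)   { return 1; }            // default increment
    std::size_t stride(vec_slice const & s) { return s.stride(); }   // proxy knows its own

    template <typename V>
    double sum3(V const & v, double const * buf)
    {
      double s = 0;
      for (std::size_t i = 0; i < 3; ++i)
        s += buf[i * stride(v)];   // identical code path for both types
      return s;
    }

    int main()
    {
      double buf[7] = {1, 2, 3, 4, 5, 6, 7};
      std::cout << sum3(dense_vec(), buf) << " "        // 1 + 2 + 3 = 6
                << sum3(vec_slice(), buf) << std::endl; // 1 + 4 + 7 = 12
    }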
diff --cc viennacl/vector.hpp
index 9af975e,23e4906..a9e5358
--- a/viennacl/vector.hpp
+++ b/viennacl/vector.hpp
@@@ -1,1723 -1,3240 +1,4966 @@@
++<<<<<<< HEAD
 +#ifndef VIENNACL_VECTOR_HPP_
 +#define VIENNACL_VECTOR_HPP_
 +
 +/* =========================================================================
 +   Copyright (c) 2010-2012, Institute for Microelectronics,
 +                            Institute for Analysis and Scientific Computing,
 +                            TU Wien.
 +
 +                            -----------------
 +                  ViennaCL - The Vienna Computing Library
 +                            -----------------
 +
 +   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 +               
 +   (A list of authors and contributors can be found in the PDF manual)
 +
 +   License:         MIT (X11), see file LICENSE in the base directory
 +============================================================================= */
 +
 +/** @file vector.hpp
 +    @brief The vector type with operator-overloads and proxy classes is defined here. 
 +           Linear algebra operations such as norms and inner products are located in linalg/vector_operations.hpp
 +*/
 +
 +
 +#include "viennacl/forwards.h"
 +#include "viennacl/ocl/backend.hpp"
 +#include "viennacl/scalar.hpp"
 +#include "viennacl/tools/tools.hpp"
 +#include "viennacl/tools/entry_proxy.hpp"
 +#include "viennacl/linalg/vector_operations.hpp"
 +
 +namespace viennacl
 +{
 +    
 +    /** @brief An expression template class that represents a binary operation that yields a vector
 +    *
 +    * In contrast to full expression templates as introduced by Veldhuizen, ViennaCL does not allow nested expressions.
 +    * The reason is that this would require automated GPU viennacl::ocl::kernel generation, which then has to be compiled just-in-time.
 +    * For performance-critical applications, it is better to write the appropriate viennacl::ocl::kernels by hand.
 +    *
 +    * Assumption: dim(LHS) >= dim(RHS), where dim(scalar) = 0, dim(vector) = 1 and dim(matrix) = 2
 +    *
 +    * @tparam LHS   left hand side operand
 +    * @tparam RHS   right hand side operand
 +    * @tparam OP    the operator
 +    */
 +    template <typename LHS, typename RHS, typename OP>
 +    class vector_expression
 +    {
 +      public:
 +        /** @brief Extracts the vector type from the two operands.
 +        */
 +        typedef typename viennacl::tools::VECTOR_EXTRACTOR<LHS, RHS>::ResultType    VectorType;
 +      
 +        vector_expression(LHS & lhs, RHS & rhs) : _lhs(lhs), _rhs(rhs) {}
 +        
 +        /** @brief Get left hand side operand
 +        */
 +        LHS & lhs() const { return _lhs; }
 +        /** @brief Get right hand side operand
 +        */
 +        RHS & rhs() const { return _rhs; }
 +        
 +        /** @brief Returns the size of the result vector */
 +        std::size_t size() const { return viennacl::tools::VECTOR_SIZE_DEDUCER<LHS, RHS, OP>::size(_lhs, _rhs); }
 +        
 +      private:
 +        /** @brief The left hand side operand */
 +        LHS & _lhs;
 +        /** @brief The right hand side operand */
 +        RHS & _rhs;
 +    };
 +    
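// A minimal standalone analogue of the deferred-evaluation idea behind
// vector_expression (an illustrative sketch, not part of this header or
// this diff): the operator returns a lightweight handle to its operands,
// and the arithmetic runs only when the expression is assigned to a target.

#include <cstddef>
#include <iostream>
#include <vector>

struct add_expr
{
  add_expr(std::vector<double> const & ll, std::vector<double> const & rr) : l(ll), r(rr) {}
  std::vector<double> const & l;
  std::vector<double> const & r;
};

add_expr operator+(std::vector<double> const & l, std::vector<double> const & r)
{ return add_expr(l, r); }                    // no arithmetic yet, operands only

void assign(std::vector<double> & dst, add_expr const & e)
{
  dst.resize(e.l.size());
  for (std::size_t i = 0; i < dst.size(); ++i)
    dst[i] = e.l[i] + e.r[i];                 // single pass, no temporary vector
}

int main()
{
  std::vector<double> a(2, 1.0), b(2, 3.0), c;
  assign(c, a + b);                           // expression built, then evaluated
  std::cout << c[0] << " " << c[1] << std::endl;   // prints: 4 4
}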
 +    /** @brief A STL-type const-iterator for vector elements. Elements can be accessed, but cannot be manipulated. VERY SLOW!!
 +    *
 +    * Every dereference operation initiates a transfer from the GPU to the CPU. The overhead of such a transfer is around 50us, so 20,000 dereferences take one second.
 +    * This is four orders of magnitude slower than similar dereferences on the CPU. However, increments and comparisons of iterators are as fast as for CPU types.
 +    * If you need a fast iterator, copy the whole vector to the CPU first and iterate over the CPU object, e.g.
 +    * std::vector<float> temp;
 +    * copy(gpu_vector, temp);
 +    * for (std::vector<float>::const_iterator iter = temp.begin();
 +    *      iter != temp.end();
 +    *      ++iter)
 +    * {
 +    *   //do something
 +    * }
 +    * Note that you may obtain inconsistent data if entries of gpu_vector are manipulated elsewhere in the meantime.
 +    *
 +    * @tparam SCALARTYPE  The underlying floating point type (either float or double)
 +    * @tparam ALIGNMENT   Alignment of the underlying vector, @see vector
 +    */
 +    template<class SCALARTYPE, unsigned int ALIGNMENT>
 +    class const_vector_iterator
 +    {
 +        typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>    self_type;
 +      public:
 +        typedef scalar<SCALARTYPE>            value_type;
 +        typedef long                          difference_type;
 +        
 +        const_vector_iterator() {};
 +        /** @brief Constructor
 +        *   @param vec    The vector over which to iterate
 +        *   @param index  The starting index of the iterator
 +        */        
 +        const_vector_iterator(vector<SCALARTYPE, ALIGNMENT> const & vec,
 +                              cl_uint index,
 +                              cl_uint start = 0,
 +                              vcl_ptrdiff_t stride = 1) : elements_(vec.handle()), index_(index), start_(start), stride_(stride) {};
 +                              
 +        const_vector_iterator(viennacl::ocl::handle<cl_mem> const & elements,
 +                              cl_uint index,
 +                              cl_uint start = 0,
 +                              vcl_ptrdiff_t stride = 1) : elements_(elements), index_(index), start_(start), stride_(stride) {};
 +
 +        
 +        value_type operator*(void) const 
 +        { 
 +           value_type result;
 +           result = entry_proxy<SCALARTYPE>(start_ + index_ * stride_, elements_);
 +           return result;
 +        }
 +        self_type operator++(void) { index_ += stride_; return *this; }
 +        self_type operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
 +        
 +        bool operator==(self_type const & other) const { return index_ == other.index_; }
 +        bool operator!=(self_type const & other) const { return index_ != other.index_; }
 +        
 +//        self_type & operator=(self_type const & other)
 +//        {
 +//           _index = other._index;
 +//           elements_ = other._elements;
 +//           return *this;
 +//        }   
 +
 +        difference_type operator-(self_type const & other) const { difference_type result = index_; return (result - static_cast<difference_type>(other.index_)); }
 +        self_type operator+(difference_type diff) const { return self_type(elements_, index_ + diff * stride_, start_, stride_); }
 +        
 +        //std::size_t index() const { return index_; }
 +        std::size_t offset() const { return start_ + index_ * stride_; }
 +        std::size_t stride() const { return stride_; }
 +        viennacl::ocl::handle<cl_mem> const & handle() const { return elements_; }
 +
 +      protected:
 +        /** @brief  Handle to the OpenCL buffer holding the vector elements */
 +        viennacl::ocl::handle<cl_mem> elements_;
 +        std::size_t index_;  //offset from the beginning of elements_
 +        std::size_t start_;
 +        vcl_ptrdiff_t stride_;
 +    };
 +    
 +
 +    /** @brief A STL-type iterator for vector elements. Elements can be accessed and manipulated. VERY SLOW!!
 +    *
 +    * Every dereference operation initiates a transfer from the GPU to the CPU. The overhead of such a transfer is around 50us, so 20,000 dereferences take one second.
 +    * This is four orders of magnitude slower than similar dereferences on the CPU. However, increments and comparisons of iterators are as fast as for CPU types.
 +    * If you need a fast iterator, copy the whole vector to the CPU first and iterate over the CPU object, e.g.
 +    * std::vector<float> temp;
 +    * copy(gpu_vector, temp);
 +    * for (std::vector<float>::const_iterator iter = temp.begin();
 +    *      iter != temp.end();
 +    *      ++iter)
 +    * {
 +    *   //do something
 +    * }
 +    * copy(temp, gpu_vector);
 +    * Note that you may obtain inconsistent data if you manipulate entries of gpu_vector in the meantime.
 +    *
 +    * @tparam SCALARTYPE  The underlying floating point type (either float or double)
 +    * @tparam ALIGNMENT   Alignment of the underlying vector, @see vector
 +    */
 +    template<class SCALARTYPE, unsigned int ALIGNMENT>
 +    class vector_iterator : public const_vector_iterator<SCALARTYPE, ALIGNMENT>
 +    {
 +        typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>  base_type;
 +        typedef vector_iterator<SCALARTYPE, ALIGNMENT>        self_type;
 +      public:
 +        vector_iterator() : base_type(){};
 +        vector_iterator(viennacl::ocl::handle<cl_mem> const & elements, std::size_t index)  : base_type(elements, index) {};
 +        /** @brief Constructor
 +        *   @param vec    The vector over which to iterate
 +        *   @param index  The starting index of the iterator
 +        */        
 +        vector_iterator(vector<SCALARTYPE, ALIGNMENT> & vec, cl_uint index) : base_type(vec, index) {};
 +        vector_iterator(base_type const & b) : base_type(b) {};
 +
 +        typename base_type::value_type operator*(void)  
 +        { 
 +           typename base_type::value_type result;
 +           result = entry_proxy<SCALARTYPE>(base_type::start_ + base_type::index_ * base_type::stride_, base_type::elements_); 
 +           return result;
 +        }
 +        
 +        viennacl::ocl::handle<cl_mem> handle() { return base_type::elements_; }
 +        
 +        operator base_type() const
 +        {
 +          return base_type(base_type::elements_, base_type::index_, base_type::start_, base_type::stride_);
 +        }
 +    };
 +
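// Hedged usage sketch of the advice in the iterator documentation above
// (assumes a working OpenCL context and that <vector> has been included;
// the function name is illustrative): copy the whole vector to the host
// once, then iterate at CPU speed.

template <typename ScalarT, unsigned int AlignV>
void process_on_host(viennacl::vector<ScalarT, AlignV> const & gpu_vector)
{
  std::vector<ScalarT> temp(gpu_vector.size());
  viennacl::copy(gpu_vector, temp);   // one bulk transfer instead of one per entry
  for (typename std::vector<ScalarT>::const_iterator it = temp.begin();
       it != temp.end();
       ++it)
  {
    // work with *it here
  }
}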
 +    // forward definition in forwards.h!
 +    /** @brief A vector class representing a linear memory sequence on the GPU. Inspired by boost::numeric::ublas::vector
 +    *
 +    *  This is the basic vector type of ViennaCL. It is similar to std::vector and boost::numeric::ublas::vector and supports various linear algebra operations.
 +    * By default, the internal length of the vector is padded to a multiple of 'ALIGNMENT' in order to speed up several GPU viennacl::ocl::kernels.
 +    *
 +    * @tparam SCALARTYPE  The floating point type, either 'float' or 'double'
 +    * @tparam ALIGNMENT   The internal memory size is size() rounded up to a multiple of ALIGNMENT. ALIGNMENT must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
 +    */
 +    template<class SCALARTYPE, unsigned int ALIGNMENT>
 +    class vector
 +    {
 +      typedef vector<SCALARTYPE, ALIGNMENT>         self_type;
 +      
 +    public:
 +      typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
 +      typedef vcl_size_t                                        size_type;
 +      typedef vcl_ptrdiff_t                                     difference_type;
 +      typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>      const_iterator;
 +      typedef vector_iterator<SCALARTYPE, ALIGNMENT>            iterator;
 +      
 +      static const int alignment = ALIGNMENT;
 +
 +      /** @brief Default constructor in order to be compatible with various containers.
 +      */
 +      vector() : size_(0) { viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::init();  }
 +
 +      /** @brief An explicit constructor for the vector, allocating the given amount of memory (plus a padding specified by 'ALIGNMENT')
 +      *
 +      * @param vec_size   The length (i.e. size) of the vector.
 +      */
 +      explicit vector(size_type vec_size) : size_(vec_size)
 +      {
 +        viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::init(); 
 +        
 +        if (size_ > 0)
 +          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
 +        
 +        //force entries above size_ to zero:
 +        if (size_ < internal_size())
 +        {
 +          std::vector<SCALARTYPE> temp(internal_size() - size_);
 +          cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), elements_.get(), CL_TRUE, sizeof(SCALARTYPE)*size_, sizeof(SCALARTYPE)*(internal_size() - size_), &(temp[0]), 0, NULL, NULL);
 +          //assert(err == CL_SUCCESS);
 +          VIENNACL_ERR_CHECK(err);
 +        }
 +      }
 +
 +      /** @brief Create a vector from existing OpenCL memory
 +      *
 +      * Note: The provided memory must take the ALIGNMENT into account, i.e. existing_mem must be at least of size internal_size()!
 +      * This is trivially the case with the default alignment, but should be considered when using vector<> with an alignment parameter not equal to 1.
 +      *
 +      * @param existing_mem   An OpenCL handle representing the memory
 +      * @param vec_size       The size of the vector. 
 +      */
 +      explicit vector(cl_mem existing_mem, size_type vec_size) : size_(vec_size),  elements_(existing_mem)
 +      {
 +        elements_.inc();  //prevents the user-provided memory from being deleted once the vector object is destroyed.
 +      }
 +      
 +      template <typename LHS, typename RHS, typename OP>
 +      vector(vector_expression<LHS, RHS, OP> const & other) : size_(other.size())
 +      {
 +        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*other.size());
 +        *this = other;
 +      }
 +      
 +      /** @brief The copy constructor
 +      *
 +      * Entries of 'vec' are directly copied to this vector.
 +      */
 +      vector(const self_type & vec) :
 +        size_(vec.size())
 +      {
 +        viennacl::linalg::kernels::vector<SCALARTYPE, 1>::init(); 
 +        
 +        if (size() != 0)
 +        {
 +          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
 +          cl_int err;
 +          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), vec.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
 +          //assert(err == CL_SUCCESS);
 +          VIENNACL_ERR_CHECK(err);
 +        }
 +      }
 +
 +      /** @brief Assignment operator. This vector is resized if 'vec' is of a different size.
 +      */
 +      self_type & operator=(const self_type & vec)
 +      {
 +        resize(vec.size());
 +        if (size() != 0)
 +        {
 +          cl_int err;
 +          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), vec.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
 +          VIENNACL_ERR_CHECK(err);
 +        }
 +        return *this;
 +      }
 +
 +
 +      /** @brief Implementation of the operation v1 = alpha * v2, where alpha is a GPU scalar
 +      *
 +      * @param proxy  An expression template proxy class.
 +      */
 +      template <typename VectorType>   //use template to cover const/non-const of VectorType:
 +      self_type & operator = (const vector_expression< VectorType,
 +                                                       const scalar<SCALARTYPE>,
 +                                                       op_prod> & proxy)
 +      {
 +        resize(proxy.lhs().size());
 +        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
 +        viennacl::linalg::mult(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +
 +      /** @brief Implementation of the operation v1 = alpha * v2, where alpha is a CPU scalar
 +      *
 +      * @param proxy  An expression template proxy class.
 +      */
 +      template <typename VectorType>   //use template to cover const/non-const of VectorType:
 +      self_type & operator = (const vector_expression< VectorType,
 +                                                       const SCALARTYPE,
 +                                                       op_prod> & proxy)
 +      {
 +        resize(proxy.lhs().size());
 +        viennacl::linalg::mult(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +
 +      /** @brief Implementation of the operation v1 = v2 / alpha, where alpha is a GPU scalar
 +      *
 +      * @param proxy  An expression template proxy class.
 +      */
 +      template <typename VectorType>   //use template to cover const/non-const of VectorType:
 +      self_type & operator = (const vector_expression< VectorType,
 +                                                       const scalar<SCALARTYPE>,
 +                                                       op_div> & proxy)
 +      {
 +        resize(proxy.lhs().size());
 +        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
 +        viennacl::linalg::divide(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +
 +      /** @brief Implementation of the operation v1 = v2 / alpha, where alpha is a CPU scalar
 +      *
 +      * @param proxy  An expression template proxy class.
 +      */
 +      template <typename VectorType>   //use template to cover const/non-const of VectorType:
 +      self_type & operator = (const vector_expression< VectorType,
 +                                                       const SCALARTYPE,
 +                                                       op_div> & proxy)
 +      {
 +        resize(proxy.lhs().size());
 +        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
 +        viennacl::linalg::mult(proxy.lhs(), static_cast<SCALARTYPE>(1.0) / proxy.rhs(), *this);
 +        return *this;
 +      }
 +
 +      //v1 = v2 + v3; 
 +      /** @brief Implementation of the operation v1 = v2 + v3
 +      *
 +      * @param proxy  An expression template proxy class.
 +      */
 +      self_type & operator = (const vector_expression< const self_type,
 +                                                       const self_type,
 +                                                       op_add> & proxy)
 +      {
 +        assert(proxy.lhs().size() == size() && "Incompatible vector sizes!");
 +        //resize(proxy.lhs().size());
 +        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
 +        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +      
 +      //v1 = v2 - v3; 
 +      /** @brief Implementation of the operation v1 = v2 - v3
 +      *
 +      * @param proxy  An expression template proxy class.
 +      */
 +      self_type & operator = (const vector_expression< const self_type,
 +                                                       const self_type,
 +                                                       op_sub> & proxy)
 +      {
 +        assert(proxy.lhs().size() == size() && "Incompatible vector sizes!");
 +        //resize(proxy.lhs().size());
 +        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
 +        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
 +        return *this;
 +      }
 +
 +
 +      // assign vector range or vector slice (implemented in vector_proxy.hpp)
 +      self_type & operator = (const vector_range<self_type> &);
 +      self_type & operator = (const vector_slice<self_type> &);
 +      
 +      ///////////////////////////// Matrix Vector interaction start ///////////////////////////////////
 +
 +      //Note: The following operator overloads are defined in matrix_operations.hpp, compressed_matrix_operations.hpp and coordinate_matrix_operations.hpp
 +      //This is certainly not the nicest approach and will most likely be changed in the future, but it works :-)
 +      
 +      //matrix<>
 +      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename F, unsigned int MAT_ALIGNMENT>
 +      self_type & operator=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                const self_type,
 +                                                op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename F, unsigned int MAT_ALIGNMENT>
 +      self_type & operator+=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +                                                
 +      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename F, unsigned int MAT_ALIGNMENT>
 +      self_type & operator-=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                          const self_type,
 +                                                                          op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename F, unsigned int MAT_ALIGNMENT>
 +      self_type operator+(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename F, unsigned int MAT_ALIGNMENT>
 +      self_type operator-(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      //transposed_matrix_proxy:
 +      /** @brief Operator overload for v1 = trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename F, unsigned int MAT_ALIGNMENT>
 +      self_type & operator=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                              const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                              op_trans >,
 +                                                     const self_type,
 +                                                     op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 += trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename F, unsigned int MAT_ALIGNMENT>
 +      self_type & operator+=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                               const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                               op_trans >,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +                                                
 +      /** @brief Operator overload for v1 -= trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename F, unsigned int MAT_ALIGNMENT>
 +      self_type & operator-=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                               const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                               op_trans >,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 + trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename F, unsigned int MAT_ALIGNMENT>
 +      self_type operator+(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                            const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                            op_trans >,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 - trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename F, unsigned int MAT_ALIGNMENT>
 +      self_type operator-(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                            const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
 +                                                                            op_trans >,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +                                                                       
 +                                                                       
 +      //                                                                 
 +      //////////// compressed_matrix<>
 +      //
 +      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator=(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                     const self_type,
 +                                                     op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator+=(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +                                                
 +      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator-=(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator+(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                                       const self_type,
 +                                                                       op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator-(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                                       const self_type,
 +                                                                       op_prod> & proxy);
 +
 +      //
 +      // coordinate_matrix<>
 +      //
 +      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                            const self_type,
 +                            op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator+=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +                                                
 +      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator-=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator+(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator-(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      //
 +      // ell_matrix<>
 +      //
 +      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type ell_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator=(const vector_expression< const ell_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                            const self_type,
 +                            op_prod> & proxy);
 +      
 +      //
 +      // hyb_matrix<>
 +      //
 +      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type hyb_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator=(const vector_expression< const hyb_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                            const self_type,
 +                            op_prod> & proxy);
 +      
 +      //
 +      // circulant_matrix<>
 +      //
 +      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a structured matrix of type circulant_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator=(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                     const self_type,
 +                                                     op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a structured matrix of type circulant_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator+=(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +                                                
 +      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a structured matrix of type circulant_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator-=(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a structured matrix of type circulant_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator+(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a structured matrix of type circulant_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator-(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +
 +      //
 +      // hankel_matrix<>
 +      //
 +      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a structured matrix of type hankel_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator=(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                     const self_type,
 +                                                     op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a structured matrix of type hankel_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator+=(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +                                                
 +      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a structured matrix of type hankel_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator-=(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a structured matrix of type hankel_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator+(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a structured matrix of type hankel_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator-(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      //
 +      // toeplitz_matrix<>
 +      //
 +      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a structured matrix of type toeplitz_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator=(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                     const self_type,
 +                                                     op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a structured matrix of type toeplitz_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator+=(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +                                                
 +      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a structured matrix of type toeplitz_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator-=(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a structured matrix of type toeplitz_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator+(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a structured matrix of type toeplitz_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator-(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      
 +      //
 +      // vandermonde_matrix<>
 +      //
 +      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a structured matrix of type vandermonde_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator=(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                     const self_type,
 +                                                     op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a structured matrix of type vandermonde_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator+=(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +                                                
 +      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a structured matrix of type vandermonde_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type & operator-=(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                      const self_type,
 +                                                      op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a structured matrix of type vandermonde_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator+(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a structured matrix of type vandermonde_matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <unsigned int MAT_ALIGNMENT>
 +      self_type operator-(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
 +                                                   const self_type,
 +                                                   op_prod> & proxy);
 +
 +      
 +      
 +      ///////////////////////////// Matrix Vector interaction end ///////////////////////////////////
 +
 +      //enlarge or reduce allocated memory and set unused memory to zero
 +      /** @brief Resizes the allocated memory for the vector. Pads the memory to be a multiple of 'ALIGNMENT'
 +      *
 +      *  @param new_size  The new size of the vector
 +      *  @param preserve  If true, old entries of the vector are preserved, otherwise they are discarded.
 +      */
 +      void resize(size_type new_size, bool preserve = true)
 +      {
 +        assert(new_size > 0);
 +        
 +        if (new_size != size_)
 +        {
 +          std::size_t new_internal_size = viennacl::tools::roundUpToNextMultiple<std::size_t>(new_size, ALIGNMENT);
 +        
 +          std::vector<SCALARTYPE> temp(size_);
 +          if (preserve && size_ > 0)
 +            fast_copy(*this, temp);
 +          temp.resize(new_size);  //drop all entries above new_size
 +          temp.resize(new_internal_size); //enlarge to fit new internal size
 +          
 +          if (new_internal_size != internal_size())
 +          {
 +            elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*new_internal_size);
 +          }
 +          
 +          fast_copy(temp, *this);
 +          size_ = new_size;
 +        }
 +        
 +      }
 +      
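// Hedged usage sketch of resize() (assumes a working OpenCL context;
// the function name and values are illustrative):

inline void resize_demo()
{
  viennacl::vector<float> v(4);   // size 4, internally padded to a multiple of ALIGNMENT
  v.resize(8);                    // grow: old entries are preserved, new ones are zero
  v.resize(2, false);             // shrink without preserving the old entries
}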
 +
 +      //read-write access to an element of the vector
 +      /** @brief Read-write access to a single element of the vector
 +      */
 +      entry_proxy<SCALARTYPE> operator()(size_type index)
 +      {
 +        return entry_proxy<SCALARTYPE>(index, elements_);
 +      }
 +
 +      /** @brief Read-write access to a single element of the vector
 +      */
 +      entry_proxy<SCALARTYPE> operator[](size_type index)
 +      {
 +        return entry_proxy<SCALARTYPE>(index, elements_);
 +      }
 +
 +
 +      /** @brief Read access to a single element of the vector
 +      */
 +      scalar<SCALARTYPE> operator()(size_type index) const
 +      {
 +        scalar<SCALARTYPE> tmp;
 +        cl_int err;
 +        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), elements_, tmp.handle().get(), sizeof(SCALARTYPE)*index, 0, sizeof(SCALARTYPE), 0, NULL, NULL);
 +        //assert(err == CL_SUCCESS);
 +        VIENNACL_ERR_CHECK(err);
 +        return tmp;
 +      }
 +      
 +      /** @brief Read access to a single element of the vector
 +      */
 +      scalar<SCALARTYPE> operator[](size_type index) const
 +      {
 +        return operator()(index);
 +      }
 +      
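// Hedged sketch: each operator[] / operator() access moves a single entry
// between host and device, so it is handy for spot checks but slow inside
// loops (assumes a working OpenCL context; the function name is illustrative):

inline float entry_access_demo()
{
  viennacl::vector<float> v(10);
  v[0] = 1.0f;    // write through entry_proxy<float>: one host-to-device transfer
  return v[0];    // read back via the proxy: one device-to-host transfer per access
}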
 +      /** @brief Inplace addition of a vector
 +      */
 +      self_type & operator += (const self_type & vec)
 +      {
 +        viennacl::linalg::inplace_add(*this, vec);
 +        return *this;
 +      }
 +
 +      /** @brief Inplace addition of a scaled vector, i.e. v1 += alpha * v2, where alpha is a GPU scalar
 +      */
 +      self_type & operator += (const vector_expression< const self_type,
 +                                                        const scalar<SCALARTYPE>,
 +                                                        op_prod> & proxy)
 +      {
 +        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), proxy.rhs());
 +        return *this;
 +      }
 +
 +      /** @brief Inplace addition of a scaled vector, i.e. v1 += alpha * v2, where alpha is a CPU scalar
 +      */
 +      self_type & operator += (const vector_expression< const self_type,
 +                                                        const SCALARTYPE,
 +                                                        op_prod> & proxy)
 +      {
 +        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), proxy.rhs());
 +        return *this;
 +      }
 +
 +      /** @brief Inplace addition of a vector divided by a scalar, i.e. v1 += v2 / alpha, where alpha is a GPU scalar
 +      */
 +      self_type & operator += (const vector_expression< const self_type,
 +                                                        const scalar<SCALARTYPE>,
 +                                                        op_div> & proxy)
 +      {
 +        viennacl::linalg::inplace_div_add(*this, proxy.lhs(), proxy.rhs());
 +        return *this;
 +      }
 +
 +
 +
 +      /** @brief Inplace subtraction of a vector
 +      */
 +      self_type & operator -= (const self_type & vec)
 +      {
 +        viennacl::linalg::inplace_sub(*this, vec);
 +        return *this;
 +      }
 +
 +      /** @brief Inplace subtraction of a scaled vector, i.e. v1 -= alpha * v2, where alpha is a GPU scalar
 +      */
 +      self_type & operator -= (const vector_expression< const self_type,
 +                                                        const scalar<SCALARTYPE>,
 +                                                        op_prod> & proxy)
 +      {
 +        viennacl::linalg::inplace_mul_sub(*this, proxy.lhs(), proxy.rhs());
 +        return *this;
 +      }
 +
 +      /** @brief Inplace subtraction of a scaled vector, i.e. v1 -= alpha * v2, where alpha is a CPU scalar
 +      */
 +      self_type & operator -= (const vector_expression< const self_type,
 +                                                        const SCALARTYPE,
 +                                                        op_prod> & proxy)
 +      {
 +        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), -proxy.rhs());
 +        return *this;
 +      }
 +
 +      /** @brief Inplace subtraction of a vector divided by a scalar, i.e. v1 -= v2 / alpha, where alpha is a GPU scalar
 +      */
 +      self_type & operator -= (const vector_expression< const self_type,
 +                                                        const scalar<SCALARTYPE>,
 +                                                        op_div> & proxy)
 +      {
 +        viennacl::linalg::inplace_div_sub(*this, proxy.lhs(), proxy.rhs());
 +        return *this;
 +      }
 +      
 +      
 +      
 +
 +      /** @brief Scales this vector by a CPU scalar value
 +      */
 +      self_type & operator *= (SCALARTYPE val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, val);
 +        return *this;
 +      }
 +
 +      /** @brief Scales this vector by a GPU scalar value
 +      */
 +      self_type & operator *= (scalar<SCALARTYPE> const & gpu_val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, gpu_val);
 +        return *this;
 +      }
 +
 +      /** @brief Divides this vector by a CPU scalar value
 +      */
 +      self_type & operator /= (SCALARTYPE val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, static_cast<SCALARTYPE>(1) / val);
 +        return *this;
 +      }
 +      
 +      /** @brief Divides this vector by a GPU scalar value
 +      */
 +      self_type & operator /= (scalar<SCALARTYPE> const & gpu_val)
 +      {
 +        viennacl::linalg::inplace_divide(*this, gpu_val);
 +        return *this;
 +      }
 +      
 +      
 +      
 +      // free addition
 +      
 +      /** @brief Adds up two vectors
 +      */
 +      vector_expression< const self_type, const self_type, op_add>
 +      operator + (const self_type & vec) const
 +      {
 +        return vector_expression< const self_type, 
 +                                  const self_type,
 +                                  op_add>(*this, vec);
 +      }
 +      
 +      /** @brief Adds a scaled vector, i.e. result = v1 + v2 * alpha, where alpha is a GPU scalar
 +      */
 +      self_type operator + (const vector_expression< const self_type,
 +                                                     const scalar<SCALARTYPE>,
 +                                                     op_prod> & proxy) const
 +      {
 +        vector<SCALARTYPE, ALIGNMENT> result(size_);
 +        viennacl::linalg::mul_add(proxy.lhs(), proxy.rhs(), *this, result);
 +        return result;
 +      }
 +
 +      /** @brief Adds a scaled vector, i.e. result = v1 + v2 * alpha, where alpha is a CPU scalar
 +      */
 +      self_type operator + (const vector_expression< const self_type,
 +                                                     const SCALARTYPE,
 +                                                     op_prod> & proxy) const
 +      {
 +        vector<SCALARTYPE, ALIGNMENT> result(size_);
 +        viennacl::linalg::mul_add(proxy.lhs(), proxy.rhs(), *this, result);
 +        return result;
 +      }
 +
 +
 +      //
 +      // free subtraction:
 +      //
 +      /** @brief Implementation of    result = v1 - v2
 +      */
 +      vector_expression< const self_type, const self_type, op_sub>
 +      operator - (const self_type & vec) const
 +      {
 +        return vector_expression< const self_type, 
 +                                  const self_type,
 +                                  op_sub>(*this, vec);
 +      }
 +
 +
 +      /** @brief Subtracts a scaled vector, i.e. result = v1 - v2 * alpha, where alpha is a GPU scalar
 +      */
 +      self_type operator - (const vector_expression< const self_type,
 +                                                     const scalar<SCALARTYPE>,
 +                                                     op_prod> & proxy) const
 +      {
 +        vector<SCALARTYPE, ALIGNMENT> result(size_);
 +        result = *this;
 +        viennacl::linalg::inplace_mul_sub(result, proxy.lhs(), proxy.rhs());
 +        return result;
 +      }
 +
 +      /** @brief Subtracts a scaled vector, i.e. result = v1 - v2 * alpha, where alpha is a CPU scalar
 +      */
 +      self_type operator - (const vector_expression< const self_type,
 +                                                     const SCALARTYPE,
 +                                                     op_prod> & proxy) const
 +      {
 +        vector<SCALARTYPE, ALIGNMENT> result(size_);
 +        result = *this;
 +        viennacl::linalg::inplace_mul_add(result, proxy.lhs(), -proxy.rhs());
 +        return result;
 +      }
 +
 +      
 +      //free multiplication
 +      /** @brief Scales the vector by a CPU scalar 'alpha' and returns an expression template
 +      */
 +      vector_expression< const self_type, const SCALARTYPE, op_prod> 
 +      operator * (SCALARTYPE value) const
 +      {
 +        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const SCALARTYPE, op_prod>(*this, value);
 +      }
 +
 +      /** @brief Scales the vector by a GPU scalar 'alpha' and returns an expression template
 +      */
 +      vector_expression< const self_type, const scalar<SCALARTYPE>, op_prod> 
 +      operator * (scalar<SCALARTYPE> const & value) const
 +      {
 +        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const scalar<SCALARTYPE>, op_prod>(*this, value);
 +      }
 +
 +      //free division
 +      /** @brief Divides the vector by a CPU scalar 'alpha' and returns an expression template
 +      */
 +      vector_expression< const self_type, const SCALARTYPE, op_div> 
 +      operator / (SCALARTYPE value) const
 +      {
 +        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const SCALARTYPE, op_div>(*this, value);
 +      }
 +
 +      /** @brief Scales the vector by a GPU scalar 'alpha' and returns an expression template
 +      */
 +      vector_expression< const self_type, const scalar<SCALARTYPE>, op_div> 
 +      operator / (scalar<SCALARTYPE> const & value) const
 +      {
 +        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const scalar<SCALARTYPE>, op_div>(*this, value);
 +      }
 +      
 +      
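 +      // Usage sketch (illustrative only, not part of the library): the
 +      // operator* and operator/ overloads above return lightweight expression
 +      // templates that are evaluated upon assignment. Assuming a vector 'v1'
 +      // in a valid OpenCL context:
 +      //
 +      //   viennacl::vector<float> v2(v1.size()), v3(v1.size());
 +      //   viennacl::scalar<float> alpha = 2.0f;
 +      //   v2 = v1 * 2.0f;    // scale by a CPU scalar
 +      //   v3 = v1 / alpha;   // divide by a GPU scalar
 +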
 +      //// iterators:
 +      /** @brief Returns an iterator pointing to the beginning of the vector  (STL like)*/
 +      iterator begin()
 +      {
 +        return iterator(*this, 0);
 +      }
 +
 +      /** @brief Returns an iterator pointing to the end of the vector (STL like)*/
 +      iterator end()
 +      {
 +        return iterator(*this, size());
 +      }
 +
 +      /** @brief Returns a const-iterator pointing to the beginning of the vector (STL like)*/
 +      const_iterator begin() const
 +      {
 +        return const_iterator(*this, 0);
 +      }
 +
 +      /** @brief Returns a const-iterator pointing to the end of the vector (STL like)*/
 +      const_iterator end() const
 +      {
 +        return const_iterator(*this, size());
 +      }
 +
 +      /** @brief Swaps the entries of the two vectors
 +      */
 +      self_type & swap(self_type & other)
 +      {
 +        viennacl::swap(*this, other);  //qualified call: the member function would otherwise hide the free function viennacl::swap()
 +        return *this;
 +      }
 +      
 +      /** @brief Swaps the handles of two vectors by swapping the OpenCL handles only, no data copy
 +      */ 
 +      self_type & fast_swap(self_type & other)
 +      {
 +        assert(this->size_ == other.size_);
 +        this->elements_.swap(other.elements_);
 +        return *this;
 +      }
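 +
 +      // Usage sketch (illustrative only): swap() exchanges the entries of two
 +      // equally sized vectors on the device (O(n) work), while fast_swap()
 +      // merely exchanges the underlying OpenCL handles (O(1), no data copy):
 +      //
 +      //   v1.swap(v2);       // element-wise swap via an OpenCL kernel
 +      //   v1.fast_swap(v2);  // handle swap only; sizes must match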
 +      
 +      /** @brief Returns the length of the vector (cf. std::vector)
 +      */
 +      size_type size() const { return size_; }
 +      
 +      /** @brief Returns the maximum possible size of the vector, conservatively set to 128 MByte: OpenCL only guarantees that single memory allocations of at least this size are supported.
 +      */
 +      size_type max_size() const
 +      {
 +        return (128*1024*1024) / sizeof(SCALARTYPE);  //OpenCL guarantees CL_DEVICE_MAX_MEM_ALLOC_SIZE >= 128 MB only
 +      }
 +      /** @brief Returns the internal length of the vector, which is given by size() plus the extra memory due to padding the memory with zeros up to a multiple of 'ALIGNMENT'
 +      */
 +      size_type internal_size() const { return viennacl::tools::roundUpToNextMultiple<size_type>(size_, ALIGNMENT); }
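 +
 +      // Worked example (illustrative; assumes ALIGNMENT = 16): for a vector
 +      // with size() == 100, internal_size() rounds up to the next multiple of
 +      // the alignment and returns 112; the 12 padding entries are kept at zero.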
 +      
 +      /** @brief Returns true if the size is zero */
 +      bool empty() const { return size_ == 0; }
 +      
 +      /** @brief Returns the OpenCL memory viennacl::ocl::handle. Typically used for launching compute viennacl::ocl::kernels */
 +      const viennacl::ocl::handle<cl_mem> & handle() const { return elements_; }
 +
 +      /** @brief Resets all entries to zero. Does not change the size of the vector.
 +      */
 +      void clear()
 +      {
 +        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "clear");
 +        
 +        viennacl::ocl::enqueue(k(elements_,
 +                                 cl_uint(0),
 +                                 cl_uint(1),  //increment
 +                                 cl_uint(internal_size()))
 +                              );
 +      }
 +      //void swap(vector & other){}
 +      
 +
 +      //TODO: Think about implementing the following public member functions
 +      //void insert_element(unsigned int i, SCALARTYPE val){}
 +      //void erase_element(unsigned int i){}
 +      
 +    private:
 +      cl_uint size_;
 +      viennacl::ocl::handle<cl_mem> elements_;
 +    }; //vector
 +    
 +
 +    //
 +    //////////////////// Copy from GPU to CPU //////////////////////////////////
 +    //
 +    
 +    /** @brief STL-like transfer for the entries of a GPU vector to the CPU. The cpu type does not need to lie in a linear piece of memory.
 +    *
 +    * @param gpu_begin  GPU constant iterator pointing to the beginning of the gpu vector (STL-like)
 +    * @param gpu_end    GPU constant iterator pointing to the end of the vector (STL-like)
 +    * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
 +    void copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
 +              const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
 +              CPU_ITERATOR cpu_begin )
 +    {
 +      assert(gpu_end - gpu_begin >= 0);
 +      if (gpu_end - gpu_begin != 0)
 +      {
 +        std::vector<SCALARTYPE> temp_buffer(gpu_end - gpu_begin);
 +        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                         gpu_begin.handle().get(), CL_TRUE, 0, 
 +                                         sizeof(SCALARTYPE)*(gpu_end - gpu_begin),
 +                                         &(temp_buffer[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        viennacl::ocl::get_queue().finish();
 +        
 +        //now copy entries to cpu_vec:
 +        std::copy(temp_buffer.begin(), temp_buffer.end(), cpu_begin);
 +      }
 +    }
 +
 +    /** @brief STL-like transfer for the entries of a GPU vector to the CPU. The cpu type does not need to lie in a linear piece of memory.
 +    *
 +    * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
 +    * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
 +    * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
 +    void copy(const vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
 +              const vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
 +              CPU_ITERATOR cpu_begin )
 +
 +    {
 +      copy(const_vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_begin),
 +           const_vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_end),
 +           cpu_begin);
 +    }
 +    
 +    /** @brief Transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
 +    *
 +    * @param gpu_vec    A gpu vector
 +    * @param cpu_vec    The cpu vector. Type requirements: Output iterator can be obtained via member function .begin()
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
 +    void copy(vector<SCALARTYPE, ALIGNMENT> const & gpu_vec,
 +              CPUVECTOR & cpu_vec )
 +    {
 +      viennacl::copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
 +    }
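 +
 +    // Usage sketch (illustrative only; assumes a valid OpenCL context and an
 +    // initialized gpu vector 'vcl_vec'):
 +    //
 +    //   std::vector<float> cpu_vec(vcl_vec.size());
 +    //   viennacl::copy(vcl_vec, cpu_vec);                          // whole vector
 +    //   viennacl::copy(vcl_vec.begin(), vcl_vec.end(),
 +    //                  cpu_vec.begin());                           // iterator form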
 +
 +    //from gpu to cpu. Type assumption: cpu_vec lies in a linear memory chunk
 +    /** @brief STL-like transfer of a GPU vector to the CPU. The cpu type is assumed to reside in a linear piece of memory, such as e.g. for std::vector.
 +    *
 +    * This method is faster than the plain copy() function, because entries are
 +    * directly written to the cpu vector, starting with &(*cpu.begin()). However,
 +    * keep in mind that the cpu type MUST represent a linear piece of
 +    * memory, otherwise you will run into undefined behavior.
 +    *
 +    * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
 +    * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
 +    * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
 +    void fast_copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
 +                   const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
 +                   CPU_ITERATOR cpu_begin )
 +    {
 +      if (gpu_begin != gpu_end)
 +      {
 +        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                         gpu_begin.handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_begin.offset(),
 +                                         sizeof(SCALARTYPE)*(gpu_end - gpu_begin),
 +                                         &(*cpu_begin), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        viennacl::ocl::get_queue().finish();
 +      }
 +    }
 +
 +    /** @brief Transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
 +    *
 +    * @param gpu_vec    A gpu vector.
 +    * @param cpu_vec    The cpu vector. Type requirements: Output iterator can be obtained via member function .begin()
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
 +    void fast_copy(vector<SCALARTYPE, ALIGNMENT> const & gpu_vec,
 +                   CPUVECTOR & cpu_vec )
 +    {
 +      viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
 +    }
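 +
 +    // Usage sketch (illustrative only): fast_copy() requires the CPU target to
 +    // occupy one contiguous block of memory, e.g. a std::vector; a std::list
 +    // would only be valid with the plain copy() above.
 +    //
 +    //   std::vector<float> cpu_vec(vcl_vec.size());
 +    //   viennacl::fast_copy(vcl_vec, cpu_vec);  // single buffer read, no temporary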
 +
 +
 +
 +    #ifdef VIENNACL_HAVE_EIGEN
 +    template <unsigned int ALIGNMENT>
 +    void copy(vector<float, ALIGNMENT> const & gpu_vec,
 +              Eigen::VectorXf & eigen_vec)
 +    {
 +      viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), &(eigen_vec[0]));
 +    }
 +    
 +    template <unsigned int ALIGNMENT>
 +    void copy(vector<double, ALIGNMENT> const & gpu_vec,
 +              Eigen::VectorXd & eigen_vec)
 +    {
 +      viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), &(eigen_vec[0]));
 +    }
 +    #endif
 +
 +
 +    //
 +    //////////////////// Copy from CPU to GPU //////////////////////////////////
 +    //
 +
 +    //from cpu to gpu. Safe assumption: cpu_vector does not necessarily occupy a linear memory segment, but is not larger than the allocated memory on the GPU
 +    /** @brief STL-like transfer for the entries of a CPU vector to the GPU. The cpu type does not need to lie in a linear piece of memory.
 +    *
 +    * @param cpu_begin  CPU iterator pointing to the beginning of the cpu vector (STL-like)
 +    * @param cpu_end    CPU iterator pointing to the end of the cpu vector (STL-like)
 +    * @param gpu_begin  Output iterator for the gpu vector. The gpu vector must be at least as long as the cpu vector!
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
 +    void copy(CPU_ITERATOR const & cpu_begin,
 +              CPU_ITERATOR const & cpu_end,
 +              vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
 +    {
 +      assert(cpu_end - cpu_begin >= 0);
 +      if (cpu_begin != cpu_end)
 +      {
 +        //we require that the size of the gpu_vector is larger or equal to the cpu-size
 +        std::vector<SCALARTYPE> temp_buffer(cpu_end - cpu_begin);
 +        std::copy(cpu_begin, cpu_end, temp_buffer.begin());
 +        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                          gpu_begin.handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_begin.offset(),
 +                                          sizeof(SCALARTYPE)*(cpu_end - cpu_begin),
 +                                          &(temp_buffer[0]), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +    }
 +
 +    // for things like copy(std_vec.begin(), std_vec.end(), vcl_vec.begin() + 1);
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
 +    void copy(CPU_ITERATOR const & cpu_begin,
 +              CPU_ITERATOR const & cpu_end,
 +              const_vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
 +    {
 +      copy(cpu_begin, cpu_end, vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_begin));
 +    }
 +
 +    /** @brief Transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
 +    *
 +    * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
 +    * @param gpu_vec    The gpu vector.
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
 +    void copy(const CPUVECTOR & cpu_vec, vector<SCALARTYPE, ALIGNMENT> & gpu_vec)
 +    {
 +      viennacl::copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
 +    }
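 +
 +    // Usage sketch (illustrative only): transferring host data to the device.
 +    //
 +    //   std::vector<float> cpu_vec(100, 1.0f);
 +    //   viennacl::vector<float> vcl_vec(cpu_vec.size());
 +    //   viennacl::copy(cpu_vec, vcl_vec);                 // whole vector
 +    //   viennacl::copy(cpu_vec.begin(), cpu_vec.end(),
 +    //                  vcl_vec.begin());                  // iterator form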
 +
 +    /** @brief STL-like transfer of a CPU vector to the GPU. The cpu type is assumed to reside in a linear piece of memory, such as e.g. for std::vector.
 +    *
 +    * This method is faster than the plain copy() function, because entries are
 +    * directly read from the cpu vector, starting with &(*cpu.begin()). However,
 +    * keep in mind that the cpu type MUST represent a linear piece of
 +    * memory, otherwise you will run into undefined behavior.
 +    *
 +    * @param cpu_begin  CPU iterator pointing to the beginning of the cpu vector (STL-like)
 +    * @param cpu_end    CPU iterator pointing to the end of the vector (STL-like)
 +    * @param gpu_begin  Output iterator for the gpu vector. The gpu iterator must be incrementable (cpu_end - cpu_begin) times, otherwise the result is undefined.
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
 +    void fast_copy(CPU_ITERATOR const & cpu_begin,
 +                   CPU_ITERATOR const & cpu_end,
 +                   vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
 +    {
 +      if (cpu_begin != cpu_end)
 +      {
 +        //we require that the size of the gpu_vector is larger or equal to the cpu-size
 +        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), 
 +                                          gpu_begin.handle().get(), CL_TRUE, sizeof(SCALARTYPE) * gpu_begin.offset(), 
 +                                          sizeof(SCALARTYPE)*(cpu_end - cpu_begin), &(*cpu_begin), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +    }
 +
 +
 +    /** @brief Transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
 +    *
 +    * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
 +    * @param gpu_vec    The gpu vector.
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
 +    void fast_copy(const CPUVECTOR & cpu_vec, vector<SCALARTYPE, ALIGNMENT> & gpu_vec)
 +    {
 +      viennacl::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
 +    }
 +
 +    #ifdef VIENNACL_HAVE_EIGEN
 +    template <unsigned int ALIGNMENT>
 +    void copy(Eigen::VectorXf const & eigen_vec,
 +              vector<float, ALIGNMENT> & gpu_vec)
 +    {
 +      std::vector<float> entries(eigen_vec.size());
 +      for (size_t i = 0; i<entries.size(); ++i)
 +        entries[i] = eigen_vec(i);
 +      viennacl::fast_copy(entries.begin(), entries.end(), gpu_vec.begin());
 +    }
 +    
 +    template <unsigned int ALIGNMENT>
 +    void copy(Eigen::VectorXd const & eigen_vec,
 +              vector<double, ALIGNMENT> & gpu_vec)
 +    {
 +      std::vector<double> entries(eigen_vec.size());
 +      for (size_t i = 0; i<entries.size(); ++i)
 +        entries[i] = eigen_vec(i);
 +      viennacl::fast_copy(entries.begin(), entries.end(), gpu_vec.begin());
 +    }
 +    #endif
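 +
 +    // Usage sketch (illustrative only; requires VIENNACL_HAVE_EIGEN to be
 +    // defined before including this header):
 +    //
 +    //   Eigen::VectorXd eigen_vec(42);
 +    //   viennacl::vector<double> vcl_vec(42);
 +    //   viennacl::copy(eigen_vec, vcl_vec);  // Eigen -> GPU
 +    //   viennacl::copy(vcl_vec, eigen_vec);  // GPU -> Eigen (fast_copy internally)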
 +    
 +
 +
 +    //
 +    //////////////////// Copy from GPU to GPU //////////////////////////////////
 +    //
 +    /** @brief Copy (parts of a) GPU vector to another GPU vector
 +    *
 +    * @param gpu_src_begin    GPU iterator pointing to the beginning of the gpu vector (STL-like)
 +    * @param gpu_src_end      GPU iterator pointing to the end of the vector (STL-like)
 +    * @param gpu_dest_begin   Output iterator for the gpu vector. The gpu_dest vector must be at least as long as the gpu_src vector!
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
 +    void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
 +              const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
 +              vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin)
 +    {
 +      assert(gpu_src_end - gpu_src_begin >= 0);
 +      
 +      if (gpu_src_begin.stride() != 1)
 +      {
 +        std::cout << "ViennaCL ERROR: copy() for GPU->GPU not implemented for slices! Use operator= instead for the moment." << std::endl;
 +        exit(EXIT_FAILURE);
 +      }      
 +      else if (gpu_src_begin != gpu_src_end)
 +      {
 +        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                          gpu_src_begin.handle().get(),  //src handle
 +                                          gpu_dest_begin.handle().get(), //dest handle
 +                                          sizeof(SCALARTYPE) * gpu_src_begin.offset(), //src offset
 +                                          sizeof(SCALARTYPE) * gpu_dest_begin.offset(), //dest offset
 +                                          sizeof(SCALARTYPE) * (gpu_src_end.offset() - gpu_src_begin.offset()), //data length
 +                                          0, //no events
 +                                          NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +      }
 +    }
 +
 +    /** @brief Copy (parts of a) GPU vector to another GPU vector
 +    *
 +    * @param gpu_src_begin   GPU iterator pointing to the beginning of the gpu vector (STL-like)
 +    * @param gpu_src_end     GPU iterator pointing to the end of the vector (STL-like)
 +    * @param gpu_dest_begin  Output iterator for the gpu vector. The destination gpu vector must be at least as long as the source gpu vector!
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
 +    void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
 +              const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
 +              const_vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin)
 +    {
 +      copy(gpu_src_begin, gpu_src_end, vector_iterator<SCALARTYPE, ALIGNMENT_DEST>(gpu_dest_begin));
 +    }
 +
 +    /** @brief Transfer from a ViennaCL vector to another ViennaCL vector. Convenience wrapper for viennacl::copy(gpu_src_vec.begin(), gpu_src_vec.end(), gpu_dest_vec.begin());
 +    *
 +    * @param gpu_src_vec     The source gpu vector
 +    * @param gpu_dest_vec    The destination gpu vector
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
 +    void copy(vector<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_vec,
 +              vector<SCALARTYPE, ALIGNMENT_DEST> & gpu_dest_vec )
 +    {
 +      viennacl::copy(gpu_src_vec.begin(), gpu_src_vec.end(), gpu_dest_vec.begin());
 +    } 
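 +
 +    // Usage sketch (illustrative only): device-to-device transfer without a
 +    // round trip through host memory. Note the restriction above: source
 +    // iterators must have stride 1.
 +    //
 +    //   viennacl::vector<float> src(100), dst(100);
 +    //   viennacl::copy(src, dst);                       // whole vector
 +    //   viennacl::copy(src.begin(), src.begin() + 50,
 +    //                  dst.begin());                    // first half only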
 +
 +
 +    
 +    
 +    
 +
 +    //global functions for handling vectors:
 +    /** @brief Output stream. Output format is ublas compatible.
 +    * @param s    STL output stream
 +    * @param val  The vector that should be printed
 +    */
 +    template<class SCALARTYPE, unsigned int ALIGNMENT>
 +    std::ostream & operator<<(std::ostream & s, vector<SCALARTYPE,ALIGNMENT> const & val)
 +    {
 +      viennacl::ocl::get_queue().finish();
 +      std::vector<SCALARTYPE> tmp(val.size());
 +      copy(val.begin(), val.end(), tmp.begin());
 +      std::cout << "[" << val.size() << "](";
 +      for (typename std::vector<SCALARTYPE>::size_type i=0; i<val.size(); ++i)
 +      {
 +        if (i > 0)
 +          s << ",";
 +        s << tmp[i];
 +      }
 +      std::cout << ")";
 +      return s;
 +    }
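 +
 +    // Usage sketch (illustrative only): the output format matches Boost.uBLAS,
 +    // i.e. "[size](entry0,entry1,...)":
 +    //
 +    //   viennacl::vector<float> v(3);
 +    //   v(0) = 1.0f; v(1) = 2.0f; v(2) = 3.0f;
 +    //   std::cout << v << std::endl;  // prints [3](1,2,3)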
 +
 +    /** @brief Swaps the contents of two vectors, data is copied
 +    *
 +    * @param vec1   The first vector
 +    * @param vec2   The second vector
 +    */
 +    template<class SCALARTYPE, unsigned int ALIGNMENT>
 +    void swap(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
 +              viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2)
 +    {
 +      assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2)
 +             && "Incompatible vector sizes in swap()");
 +
 +      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "swap");
 +
 +      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
 +                               cl_uint(viennacl::traits::start(vec1)),
 +                               cl_uint(viennacl::traits::stride(vec1)),
 +                               cl_uint(viennacl::traits::size(vec1)),
 +                               viennacl::traits::handle(vec2),
 +                               cl_uint(viennacl::traits::start(vec2)),
 +                               cl_uint(viennacl::traits::stride(vec2)),
 +                               cl_uint(viennacl::traits::size(vec2)))
 +                            );
 +    }
 +    
 +    /** @brief Swaps the content of two vectors by swapping OpenCL handles only, NO data is copied
 +    *
 +    * @param v1   The first vector
 +    * @param v2   The second vector
 +    */
 +    template <typename SCALARTYPE, unsigned int ALIGNMENT>
 +    vector<SCALARTYPE, ALIGNMENT> & fast_swap(vector<SCALARTYPE, ALIGNMENT> & v1,
 +                                              vector<SCALARTYPE, ALIGNMENT> & v2) 
 +    { 
 +      return v1.fast_swap(v2);
 +    }       
 +    
 +    
 +    
 +    ////////// operations /////////////
 +    /** @brief Operator overload for the expression alpha * v1, where alpha is a host scalar (float or double) and v1 is a ViennaCL vector.
 +    *
 +    * @param value   The host scalar (float or double)
 +    * @param vec     A ViennaCL vector
 +    */
 +    template <typename SCALARTYPE, unsigned int A>
 +    vector_expression< const vector<SCALARTYPE, A>, const SCALARTYPE, op_prod> operator * (SCALARTYPE const & value, vector<SCALARTYPE, A> const & vec)
 +    {
 +      return vector_expression< const vector<SCALARTYPE, A>, const SCALARTYPE, op_prod>(vec, value);
 +    }
 +
 +    /** @brief Operator overload for the expression alpha * v1, where alpha is a ViennaCL scalar (float or double) and v1 is a ViennaCL vector.
 +    *
 +    * @param value   The ViennaCL scalar
 +    * @param vec     A ViennaCL vector
 +    */
 +    template <typename SCALARTYPE, unsigned int A>
 +    vector_expression< const vector<SCALARTYPE, A>, const scalar<SCALARTYPE>, op_prod> operator * (scalar<SCALARTYPE> const & value, vector<SCALARTYPE, A> const & vec)
 +    {
 +        return vector_expression< const vector<SCALARTYPE, A>, const scalar<SCALARTYPE>, op_prod>(vec, value);
 +    }
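 +
 +    // Usage sketch (illustrative only): scalar multiplication from the left
 +    // also yields an expression template, so no temporary vector is created
 +    // until the result is assigned:
 +    //
 +    //   viennacl::vector<float> v1(100), v2(100);
 +    //   viennacl::scalar<float> alpha = 2.0f;
 +    //   v2 = 3.0f  * v1;   // CPU scalar from the left
 +    //   v2 = alpha * v1;   // GPU scalar from the left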
 +
 +
 +    //addition and subtraction of two vector_expressions:
 +    /** @brief Operator overload for the addition of two vector expressions.
 +    *
 +    * @param proxy1  Left hand side vector expression
 +    * @param proxy2  Right hand side vector expression
 +    */
 +    template <typename LHS1, typename RHS1, typename OP1,
 +              typename LHS2, typename RHS2, typename OP2>
 +    typename vector_expression< LHS1, RHS1, OP1>::VectorType
 +    operator + (vector_expression< LHS1, RHS1, OP1> const & proxy1,
 +                vector_expression< LHS2, RHS2, OP2> const & proxy2)
 +    {
 +      assert(proxy1.size() == proxy2.size());
 +      typename vector_expression< LHS1, RHS1, OP1>::VectorType result(proxy1.size());
 +      result = proxy1;
 +      result += proxy2;
 +      return result;
 +    }
 +
 +    /** @brief Operator overload for the subtraction of two vector expressions.
 +    *
 +    * @param proxy1  Left hand side vector expression
 +    * @param proxy2  Right hand side vector expression
 +    */
 +    template <typename LHS1, typename RHS1, typename OP1,
 +              typename LHS2, typename RHS2, typename OP2>
 +    typename vector_expression< LHS1, RHS1, OP1>::VectorType
 +    operator - (vector_expression< LHS1, RHS1, OP1> const & proxy1,
 +                vector_expression< LHS2, RHS2, OP2> const & proxy2)
 +    {
 +      assert(proxy1.size() == proxy2.size());
 +      typename vector_expression< LHS1, RHS1, OP1>::VectorType result(proxy1.size());
 +      result = proxy1;
 +      result -= proxy2;
 +      return result;
 +    }
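 +
 +    // Usage sketch (illustrative only): combining two expression templates in
 +    // a BLAS-1 style update; each product is wrapped in a vector_expression
 +    // and a temporary of the underlying vector type holds the result:
 +    //
 +    //   viennacl::vector<float> u(100), v(100), w(100);
 +    //   w = 2.0f * u + 3.0f * v;
 +    //   w = 2.0f * u - 3.0f * v;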
 +    
 +    //////////// one vector expression from left /////////////////////////////////////////
 +    
 +    /** @brief Operator overload for the addition of a vector expression from the left, e.g. alpha * vec1 + vec2. Here, alpha * vec1 is wrapped into a vector_expression and then added to vec2.
 +    *
 +    * @param proxy   Left hand side vector expression
 +    * @param vec     Right hand side vector
 +    */
 +    template <typename SCALARTYPE, unsigned int A, typename LHS, typename RHS, typename OP>
 +    vector<SCALARTYPE, A> operator + (vector_expression< LHS, RHS, OP> const & proxy,
 +                                      vector<SCALARTYPE, A> const & vec)
 +    {
 +      assert(proxy.size() == vec.size());
 +      vector<SCALARTYPE, A> result(vec.size());
 +      result = proxy;
 +      result += vec;
 +      return result;
 +    }
 +
 +    /** @brief Operator overload for the subtraction of a vector expression from the left, e.g. alpha * vec1 - vec2. Here, alpha * vec1 is wrapped into a vector_expression and vec2 is then subtracted from it.
 +    *
 +    * @param proxy   Left hand side vector expression
 +    * @param vec     Right hand side vector
 +    */
 +    template <typename SCALARTYPE, unsigned int A, typename LHS, typename RHS, typename OP>
 +    vector<SCALARTYPE, A> operator - (vector_expression< LHS, RHS, OP> const & proxy,
 +                                      vector<SCALARTYPE, A> const & vec)
 +    {
 +      assert(proxy.size() == vec.size());
 +      vector<SCALARTYPE, A> result(vec.size());
 +      result = proxy;
 +      result -= vec;
 +      return result;
 +    }
 +
 +
 +    /** @brief Operator overload for the multiplication of a vector expression with a scalar from the right, e.g. (beta * vec1) * alpha. Here, beta * vec1 is wrapped into a vector_expression and then multiplied with alpha from the right.
 +    *
 +    * @param proxy   Left hand side vector expression
 +    * @param val     Right hand side scalar
 +    */
 +    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
 +    vector<SCALARTYPE> operator * (vector_expression< LHS, RHS, OP> const & proxy,
 +                                   scalar<SCALARTYPE> const & val)
 +    {
 +      vector<SCALARTYPE> result(proxy.size());
 +      result = proxy;
 +      result *= val;
 +      return result;
 +    }
 +
 +    /** @brief Operator overload for the division of a vector expression by a scalar from the right, e.g. (beta * vec1) / alpha. Here, beta * vec1 is wrapped into a vector_expression and then divided by alpha.
 +    *
 +    * @param proxy   Left hand side vector expression
 +    * @param val     Right hand side scalar
 +    */
 +    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
 +    vector<SCALARTYPE> operator / (vector_expression< LHS, RHS, OP> const & proxy,
 +                                      scalar<SCALARTYPE> const & val)
 +    {
 +      vector<SCALARTYPE> result(proxy.size());
 +      result = proxy;
 +      result /= val;
 +      return result;
 +    }
 +
 +
 +    //////////// one vector expression from right (on scalar) ///////////////////////
 +    
 +    /** @brief Operator overload for the multiplication of a vector expression with a ViennaCL scalar from the left, e.g. alpha * (beta * vec1). Here, beta * vec1 is wrapped into a vector_expression and then multiplied with alpha from the left.
 +    *
 +    * @param val     Left hand side scalar
 +    * @param proxy   Right hand side vector expression
 +    */
 +    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
 +    vector<SCALARTYPE> operator * (scalar<SCALARTYPE> const & val,
 +                                   vector_expression< LHS, RHS, OP> const & proxy)
 +    {
 +      vector<SCALARTYPE> result(proxy.size());
 +      result = proxy;
 +      result *= val;
 +      return result;
 +    }
 +    
 +    /** @brief Operator overload for the multiplication of a vector expression with a host scalar (float or double) from the left, e.g. alpha * (beta * vec1). Here, beta * vec1 is wrapped into a vector_expression and then multiplied with alpha from the left.
 +    *
 +    * @param val     Left hand side scalar
 +    * @param proxy   Right hand side vector expression
 +    */
 +    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
 +    viennacl::vector<SCALARTYPE> operator * (SCALARTYPE val,
 +                                   viennacl::vector_expression< LHS, RHS, OP> const & proxy)
 +    {
 +      viennacl::vector<SCALARTYPE> result(proxy.size());
 +      result = proxy;
 +      result *= val;
 +      return result;
 +    }
 +
 +}
 +
 +#endif
++=======
+ #ifndef VIENNACL_VECTOR_HPP_
+ #define VIENNACL_VECTOR_HPP_
+ 
+ /* =========================================================================
+    Copyright (c) 2010-2014, Institute for Microelectronics,
+                             Institute for Analysis and Scientific Computing,
+                             TU Wien.
+    Portions of this software are copyright by UChicago Argonne, LLC.
+ 
+                             -----------------
+                   ViennaCL - The Vienna Computing Library
+                             -----------------
+ 
+    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+ 
+    (A list of authors and contributors can be found in the PDF manual)
+ 
+    License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+ 
+ /** @file  viennacl/vector.hpp
+     @brief The vector type with operator-overloads and proxy classes is defined here.
+            Linear algebra operations such as norms and inner products are located in linalg/vector_operations.hpp
+ */
+ 
+ 
+ #include "viennacl/forwards.h"
+ #include "viennacl/backend/memory.hpp"
+ #include "viennacl/scalar.hpp"
+ #include "viennacl/tools/tools.hpp"
+ #include "viennacl/tools/entry_proxy.hpp"
+ #include "viennacl/linalg/detail/op_executor.hpp"
+ #include "viennacl/linalg/vector_operations.hpp"
+ #include "viennacl/meta/result_of.hpp"
+ //#include "viennacl/rand/utils.hpp"
+ #include "viennacl/context.hpp"
+ #include "viennacl/traits/handle.hpp"
+ 
+ namespace viennacl
+ {
+ 
+   /** @brief Common base class for representing vectors where the entries are not all stored explicitly.
+     *
+     * Typical examples are zero_vector or scalar_vector.
+     */
+   template<typename SCALARTYPE>
+   class implicit_vector_base
+   {
+     protected:
+       typedef vcl_size_t        size_type;
+       implicit_vector_base(size_type s, vcl_size_t i, std::pair<SCALARTYPE, bool> v, viennacl::context ctx) : size_(s), index_(std::make_pair(true,i)), value_(v), ctx_(ctx){ }
+       implicit_vector_base(size_type s, std::pair<SCALARTYPE, bool> v, viennacl::context ctx) : size_(s), index_(std::make_pair(false,0)), value_(v), ctx_(ctx){ }
+ 
+     public:
+       typedef SCALARTYPE const & const_reference;
+       typedef SCALARTYPE cpu_value_type;
+ 
+       viennacl::context context() const { return ctx_; }
+ 
+       size_type size() const { return size_; }
+ 
+       cpu_value_type  value() const { return value_.first; }
+ 
+       bool is_value_static() const { return value_.second; }
+ 
+       vcl_size_t index() const { return index_.second; }
+ 
+       bool has_index() const { return index_.first; }
+ 
+       cpu_value_type operator()(size_type i) const {
+         if(index_.first)
+           return (i==index_.second)?value_.first:0;
+         return value_.first;
+       }
+ 
+       cpu_value_type operator[](size_type i) const {
+         if(index_.first)
+           return (i==index_.second)?value_.first:0;
+         return value_.first;
+       }
+ 
+     protected:
+       size_type size_;
+       std::pair<bool, vcl_size_t> index_;
+       std::pair<SCALARTYPE, bool> value_;
+       viennacl::context ctx_;
+   };
+ 
+   /** @brief Represents a vector consisting of 1 at a given index and zeros otherwise.*/
+   template <typename SCALARTYPE>
+   class unit_vector : public implicit_vector_base<SCALARTYPE>
+   {
+       typedef implicit_vector_base<SCALARTYPE> base_type;
+     public:
+       typedef typename base_type::size_type size_type;
+       unit_vector(size_type s, size_type ind, viennacl::context ctx = viennacl::context()) : base_type(s, ind, std::make_pair(SCALARTYPE(1),true), ctx)
+       {
+         assert( (ind < s) && bool("Provided index out of range!") );
+       }
+   };
+ 
+ 
+   /** @brief Represents a vector consisting of zeros only. */
+   template <typename SCALARTYPE>
+   class zero_vector : public implicit_vector_base<SCALARTYPE>
+   {
+       typedef implicit_vector_base<SCALARTYPE> base_type;
+     public:
+       typedef typename base_type::size_type size_type;
+       typedef SCALARTYPE        const_reference;
+       zero_vector(size_type s, viennacl::context ctx = viennacl::context()) : base_type(s, std::make_pair(SCALARTYPE(0),true), ctx) {}
+   };
+ 
+   /** @brief Represents a vector consisting of ones only. */
+   template <typename SCALARTYPE>
+   class one_vector : public implicit_vector_base<SCALARTYPE>
+   {
+       typedef implicit_vector_base<SCALARTYPE> base_type;
+     public:
+       typedef typename base_type::size_type size_type;
+       typedef SCALARTYPE        const_reference;
+       one_vector(size_type s, viennacl::context ctx = viennacl::context()) : base_type(s, std::make_pair(SCALARTYPE(1),true), ctx) {}
+   };
+ 
+ 
+   /** @brief Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initializer for viennacl::vector, vector_range, or vector_slice only. */
+   template <typename SCALARTYPE>
+   class scalar_vector : public implicit_vector_base<SCALARTYPE>
+   {
+       typedef implicit_vector_base<SCALARTYPE> base_type;
+     public:
+       typedef typename base_type::size_type size_type;
+       typedef SCALARTYPE const & const_reference;
+ 
+       scalar_vector(size_type s, SCALARTYPE val, viennacl::context ctx = viennacl::context()) : base_type(s, std::make_pair(val,false), ctx) {}
+   };
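+ 
+   // Usage sketch (illustrative only): implicit vectors act as cheap
+   // initializers; no device memory proportional to their size is allocated.
+   // The corresponding operator= overloads are defined in vector_base below:
+   //
+   //   viennacl::vector<float> v(10);
+   //   v = viennacl::scalar_vector<float>(10, 3.0f);  // all entries 3.0f
+   //   v = viennacl::unit_vector<float>(10, 4);       // 1 at index 4, zeros elsewhere
+   //   v = viennacl::zero_vector<float>(10);          // all zeros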
+ 
+ 
+ //#ifdef VIENNACL_WITH_OPENCL
+ //  template<class SCALARTYPE, class DISTRIBUTION>
+ //  rand::random_vector_t<SCALARTYPE, DISTRIBUTION> random_vector(unsigned int size, DISTRIBUTION const & distribution){
+ //      return rand::random_vector_t<SCALARTYPE,DISTRIBUTION>(size,distribution);
+ //  }
+ //#endif
+ 
+ 
+   //
+   // Vector expression
+   //
+ 
+   /** @brief An expression template class that represents a binary operation that yields a vector
+   *
+   * In contrast to full expression templates as introduced by Veldhuizen, ViennaCL does not allow nested expressions.
+   * The reason is that this requires automated GPU viennacl::ocl::kernel generation, which then has to be compiled just-in-time.
+   * For performance-critical applications, it is better to write the appropriate viennacl::ocl::kernels by hand.
+   *
+   * Assumption: dim(LHS) >= dim(RHS), where dim(scalar) = 0, dim(vector) = 1 and dim(matrix) = 2
+   *
+   * @tparam LHS   left hand side operand
+   * @tparam RHS   right hand side operand
+   * @tparam OP    the operator
+   */
+   template <typename LHS, typename RHS, typename OP>
+   class vector_expression
+   {
+       typedef typename viennacl::result_of::reference_if_nonscalar<LHS>::type     lhs_reference_type;
+       typedef typename viennacl::result_of::reference_if_nonscalar<RHS>::type     rhs_reference_type;
+ 
+     public:
+       enum { alignment = 1 };
+ 
+       /** @brief The size type used for the result vector.
+       */
+       typedef vcl_size_t       size_type;
+ 
+       vector_expression(LHS & l, RHS & r) : lhs_(l), rhs_(r) {}
+ 
+       /** @brief Get left hand side operand
+       */
+       lhs_reference_type lhs() const { return lhs_; }
+       /** @brief Get right hand side operand
+       */
+       rhs_reference_type rhs() const { return rhs_; }
+ 
+       /** @brief Returns the size of the result vector */
+       size_type size() const { return viennacl::traits::size(*this); }
+ 
+     private:
+       /** @brief The left hand side operand */
+       lhs_reference_type lhs_;
+       /** @brief The right hand side operand */
+       rhs_reference_type rhs_;
+   };
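+ 
+   // Illustrative note (not part of the library source): for two objects of
+   // type viennacl::vector<float>, an expression such as 'v1 + v2' is encoded
+   // as a type along the lines of
+   //
+   //   vector_expression< const vector_base<float>,
+   //                      const vector_base<float>,
+   //                      op_add >
+   //
+   // (op_add taken from viennacl/forwards.h); no device computation happens
+   // until the expression is assigned to a vector.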
+ 
+   /** @brief An STL-type const-iterator for vector elements. Elements can be accessed, but cannot be manipulated. VERY SLOW!!
+   *
+   * Every dereference operation initiates a transfer from the GPU to the CPU. The overhead of such a transfer is around 50us, so 20,000 dereferences take one second.
+   * This is four orders of magnitude slower than similar dereferences on the CPU. However, increments and comparisons of iterators are as fast as for CPU types.
+   * If you need a fast iterator, copy the whole vector to the CPU first and iterate over the CPU object, e.g.
+   * std::vector<float> temp;
+   * copy(gpu_vector, temp);
+   * for (std::vector<float>::const_iterator iter = temp.begin();
+   *      iter != temp.end();
+   *      ++iter)
+   * {
+   *   //do something
+   * }
+   * Note that you may obtain inconsistent data if entries of gpu_vector are manipulated elsewhere in the meantime.
+   *
+   * @tparam SCALARTYPE  The underlying floating point type (either float or double)
+   * @tparam ALIGNMENT   Alignment of the underlying vector, @see vector
+   */
+   template<class SCALARTYPE, unsigned int ALIGNMENT>
+   class const_vector_iterator
+   {
+       typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>    self_type;
+     public:
+       typedef scalar<SCALARTYPE>            value_type;
+       typedef vcl_ptrdiff_t                 difference_type;
+       typedef viennacl::backend::mem_handle handle_type;
+ 
+       //const_vector_iterator() {}
+ 
+       /** @brief Constructor
+       *   @param vec    The vector over which to iterate
+       *   @param index  The starting index of the iterator
+       *   @param start  First index of the element in the vector pointed to by the iterator (for vector_range and vector_slice)
+       *   @param stride Stride for the support of vector_slice
+       */
+       const_vector_iterator(vector_base<SCALARTYPE> const & vec,
+                             vcl_size_t index,
+                             vcl_size_t start = 0,
+                             vcl_ptrdiff_t stride = 1) : elements_(vec.handle()), index_(index), start_(start), stride_(stride) {}
+ 
+       /** @brief Constructor for vector-like treatment of arbitrary buffers
+       *   @param elements  The buffer over which to iterate
+       *   @param index     The starting index of the iterator
+       *   @param start     First index of the element in the vector pointed to by the iterator (for vector_range and vector_slice)
+       *   @param stride    Stride for the support of vector_slice
+       */
+       const_vector_iterator(handle_type const & elements,
+                             vcl_size_t index,
+                             vcl_size_t start = 0,
+                             vcl_ptrdiff_t stride = 1) : elements_(elements), index_(index), start_(start), stride_(stride) {}
+ 
+       /** @brief Dereferences the iterator and returns the value of the element. For convenience only, performance is poor due to OpenCL overhead! */
+       value_type operator*(void) const
+       {
+           value_type result;
+           result = const_entry_proxy<SCALARTYPE>(start_ + index_ * stride_, elements_);
+           return result;
+       }
+       self_type operator++(void) { index_ += stride_; return *this; }
+       self_type operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
+ 
+       bool operator==(self_type const & other) const { return index_ == other.index_; }
+       bool operator!=(self_type const & other) const { return index_ != other.index_; }
+ 
+ //        self_type & operator=(self_type const & other)
+ //        {
+ //           index_ = other._index;
+ //           elements_ = other._elements;
+ //           return *this;
+ //        }
+ 
+       difference_type operator-(self_type const & other) const
+       {
+         assert( (other.start_ == start_) && (other.stride_ == stride_) && bool("Iterators are not from the same vector (proxy)!"));
+         return static_cast<difference_type>(index_) - static_cast<difference_type>(other.index_);
+       }
+       self_type operator+(difference_type diff) const { return self_type(elements_, index_ + diff * stride_, start_, stride_); }
+ 
+       //vcl_size_t index() const { return index_; }
+       /** @brief Offset of the current element index with respect to the beginning of the buffer */
+       vcl_size_t offset() const { return start_ + index_ * stride_; }
+ 
+       /** @brief Index increment in the underlying buffer when incrementing the iterator to the next element */
+       vcl_size_t stride() const { return stride_; }
+       handle_type const & handle() const { return elements_; }
+ 
+     protected:
+       /** @brief  Handle to the buffer over which the iterator runs */
+       handle_type const & elements_;
+       vcl_size_t index_;  //offset from the beginning of elements_
+       vcl_size_t start_;
+       vcl_ptrdiff_t stride_;
+   };
+ 
+ 
+   /** @brief An STL-type iterator for vector elements. Elements can be accessed and manipulated. VERY SLOW!!
+   *
+   * Every dereference operation initiates a transfer from the GPU to the CPU. The overhead of such a transfer is around 50us, so 20,000 dereferences take one second.
+   * This is four orders of magnitude slower than similar dereferences on the CPU. However, increments and comparisons of iterators are as fast as for CPU types.
+   * If you need a fast iterator, copy the whole vector to the CPU first and iterate over the CPU object, e.g.
+   * std::vector<float> temp;
+   * copy(gpu_vector, temp);
+   * for (std::vector<float>::const_iterator iter = temp.begin();
+   *      iter != temp.end();
+   *      ++iter)
+   * {
+   *   //do something
+   * }
+   * copy(temp, gpu_vector);
+   * Note that you may obtain inconsistent data if you manipulate entries of gpu_vector in the meantime.
+   *
+   * @tparam SCALARTYPE  The underlying floating point type (either float or double)
+   * @tparam ALIGNMENT   Alignment of the underlying vector, @see vector
+   */
+   template<class SCALARTYPE, unsigned int ALIGNMENT>
+   class vector_iterator : public const_vector_iterator<SCALARTYPE, ALIGNMENT>
+   {
+       typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>  base_type;
+       typedef vector_iterator<SCALARTYPE, ALIGNMENT>        self_type;
+     public:
+       typedef typename base_type::handle_type               handle_type;
+       typedef typename base_type::difference_type           difference_type;
+ 
+       vector_iterator() : base_type(), elements_(NULL) {}
+       vector_iterator(handle_type & elements,
+                       vcl_size_t index,
+                       vcl_size_t start = 0,
+                       vcl_ptrdiff_t stride = 1)  : base_type(elements, index, start, stride), elements_(elements) {}
+       /** @brief Constructor
+       *   @param vec    The vector over which to iterate
+       *   @param index  The starting index of the iterator
+       *   @param start  Offset from the beginning of the underlying vector (for ranges and slices)
+       *   @param stride Stride for slices
+       */
+       vector_iterator(vector_base<SCALARTYPE> & vec,
+                       vcl_size_t index,
+                       vcl_size_t start = 0,
+                       vcl_ptrdiff_t stride = 1) : base_type(vec, index, start, stride), elements_(vec.handle()) {}
+       //vector_iterator(base_type const & b) : base_type(b) {}
+ 
+       typename base_type::value_type operator*(void)
+       {
+           typename base_type::value_type result;
+           result = entry_proxy<SCALARTYPE>(base_type::start_ + base_type::index_ * base_type::stride_, elements_);
+           return result;
+       }
+ 
+       difference_type operator-(self_type const & other) const { difference_type result = base_type::index_; return (result - static_cast<difference_type>(other.index_)); }
+       self_type operator+(difference_type diff) const { return self_type(elements_, base_type::index_ + diff * base_type::stride_, base_type::start_, base_type::stride_); }
+ 
+       handle_type       & handle()       { return elements_; }
+       handle_type const & handle() const { return base_type::elements_; }
+ 
+       //operator base_type() const
+       //{
+       //  return base_type(base_type::elements_, base_type::index_, base_type::start_, base_type::stride_);
+       //}
+     private:
+       handle_type & elements_;
+   };
+ 
+ 
+   /** @brief Common base class for dense vectors, vector ranges, and vector slices.
+     *
+     * @tparam SCALARTYPE   The floating point type, either 'float' or 'double'
+     */
+   template<class SCALARTYPE, typename SizeType /* see forwards.h for default type */, typename DistanceType /* see forwards.h for default type */>
+   class vector_base
+   {
+       typedef vector_base<SCALARTYPE>         self_type;
+ 
+     public:
+       typedef scalar<SCALARTYPE>                                value_type;
+       typedef SCALARTYPE                                        cpu_value_type;
+       typedef viennacl::backend::mem_handle                     handle_type;
+       typedef SizeType                                          size_type;
+       typedef DistanceType                                      difference_type;
+       typedef const_vector_iterator<SCALARTYPE, 1>              const_iterator;
+       typedef vector_iterator<SCALARTYPE, 1>                    iterator;
+ 
+       static const size_type alignment = 128;
+ 
+       /** @brief Default constructor in order to be compatible with various containers.
+       */
+       explicit vector_base() : size_(0), start_(0), stride_(1), internal_size_(0) { /* Note: One must not call ::init() here because a vector might have been created globally before the backend has become available */ }
+ 
+       /** @brief An explicit constructor for wrapping an existing vector into a vector_range or vector_slice.
+        *
+        * @param h          The existing memory handle from a vector/vector_range/vector_slice
+        * @param vec_size   The length (i.e. size) of the buffer
+        * @param vec_start  The offset from the beginning of the buffer identified by 'h'
+        * @param vec_stride Increment between two elements in the original buffer (in multiples of SCALARTYPE)
+       */
+       explicit vector_base(viennacl::backend::mem_handle & h,
+                            size_type vec_size, size_type vec_start, difference_type vec_stride)
+         : size_(vec_size), start_(vec_start), stride_(vec_stride), internal_size_(vec_size), elements_(h) {}
+ 
+       /** @brief Creates a vector and allocates the necessary memory */
+       explicit vector_base(size_type vec_size, viennacl::context ctx = viennacl::context())
+         : size_(vec_size), start_(0), stride_(1), internal_size_(viennacl::tools::align_to_multiple<size_type>(size_, alignment))
+       {
+         if (size_ > 0)
+         {
+           viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), ctx);
+           clear();
+         }
+       }
+ 
+       // CUDA or host memory:
+       explicit vector_base(SCALARTYPE * ptr_to_mem, viennacl::memory_types mem_type, size_type vec_size, vcl_size_t start = 0, difference_type stride = 1)
+         : size_(vec_size), start_(start), stride_(stride), internal_size_(vec_size)
+       {
+         if (mem_type == viennacl::CUDA_MEMORY)
+         {
+ #ifdef VIENNACL_WITH_CUDA
+           elements_.switch_active_handle_id(viennacl::CUDA_MEMORY);
+           elements_.cuda_handle().reset(reinterpret_cast<char*>(ptr_to_mem));
+           elements_.cuda_handle().inc(); //prevents the user-provided memory from being deleted once the vector object is destroyed.
+ #else
+           throw cuda_not_available_exception();
+ #endif
+         }
+         else if (mem_type == viennacl::MAIN_MEMORY)
+         {
+           elements_.switch_active_handle_id(viennacl::MAIN_MEMORY);
+           elements_.ram_handle().reset(reinterpret_cast<char*>(ptr_to_mem));
+           elements_.ram_handle().inc(); //prevents the user-provided memory from being deleted once the vector object is destroyed.
+         }
+ 
+         elements_.raw_size(sizeof(SCALARTYPE) * vec_size);
+ 
+       }
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+       /** @brief Create a vector from existing OpenCL memory
+       *
+       * Note: The provided memory must take a possible ALIGNMENT into account, i.e. existing_mem must be at least of size internal_size()!
+       * This is trivially the case with the default alignment, but should be considered when using vector<> with an alignment parameter not equal to 1.
+       *
+       * @param existing_mem   An OpenCL handle representing the memory
+       * @param vec_size       The size of the vector.
+       */
+       explicit vector_base(cl_mem existing_mem, size_type vec_size, size_type start = 0, difference_type stride = 1, viennacl::context ctx = viennacl::context())
+         : size_(vec_size), start_(start), stride_(stride), internal_size_(vec_size)
+       {
+         elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+         elements_.opencl_handle() = existing_mem;
+         elements_.opencl_handle().inc();  //prevents the user-provided memory from being deleted once the vector object is destroyed.
+         elements_.opencl_handle().context(ctx.opencl_context());
+         elements_.raw_size(sizeof(SCALARTYPE) * vec_size);
+       }
+ #endif
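+ 
+       // Usage sketch (illustrative only; assumes an existing OpenCL buffer
+       // 'existing_mem' holding at least 'n' floats, created elsewhere via
+       // clCreateBuffer(), and that viennacl::vector forwards this constructor):
+       //
+       //   viennacl::vector<float> v(existing_mem, n);  // wraps the buffer, no copy
+       //
+       // The handle's reference count is incremented above, so the buffer stays
+       // valid for the lifetime of 'v'.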
+ 
+       /** @brief Creates the vector from the supplied random vector. */
+       /*template<class DISTRIBUTION>
+       vector(rand::random_vector_t<SCALARTYPE, DISTRIBUTION> v) : size_(v.size)
+       {
+         if(size_ > 0)
+         {
+           viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size());
+           rand::buffer_dumper<SCALARTYPE, DISTRIBUTION>::dump(elements_,v.distribution,0,size_);
+         }
+       } */
+ 
+       template <typename LHS, typename RHS, typename OP>
+       explicit vector_base(vector_expression<const LHS, const RHS, OP> const & proxy)
+         : size_(viennacl::traits::size(proxy)), start_(0), stride_(1), internal_size_(viennacl::tools::align_to_multiple<size_type>(size_, alignment))
+       {
+         if (size_ > 0)
+         {
+           viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(proxy));
+           clear();
+         }
+         self_type::operator=(proxy);
+       }
+ 
+ 
+       //
+       // operator=
+       //
+ 
+ 
+       /** @brief Assignment operator. Other vector needs to be of the same size, or this vector is not yet initialized.
+       */
+       self_type & operator=(const self_type & vec)
+       {
+         assert( ( (vec.size() == size()) || (size() == 0) )
+                 && bool("Incompatible vector sizes!"));
+ 
+         if (vec.size() > 0)
+         {
+           if (size_ == 0)
+           {
+             size_ = vec.size();
+             internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
+             elements_.switch_active_handle_id(vec.handle().get_active_handle_id());
+             viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(vec));
+             pad();
+           }
+ 
+           viennacl::linalg::av(*this,
+                                vec, cpu_value_type(1.0), 1, false, false);
+         }
+ 
+         return *this;
+       }
+ 
+ 
+       /** @brief Implementation of the operation v1 = v2 @ alpha, where @ denotes either multiplication or division, and alpha is either a CPU or a GPU scalar
+       *
+       * @param proxy  An expression template proxy class.
+       */
+       template <typename LHS, typename RHS, typename OP>
+       self_type & operator=(const vector_expression<const LHS, const RHS, OP> & proxy)
+       {
+         assert( ( (viennacl::traits::size(proxy) == size()) || (size() == 0) )
+                 && bool("Incompatible vector sizes!"));
+ 
+         // initialize the necessary buffer
+         if (size() == 0)
+         {
+           size_ = viennacl::traits::size(proxy);
+           internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
+           viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(proxy));
+           pad();
+         }
+ 
+         linalg::detail::op_executor<self_type, op_assign, vector_expression<const LHS, const RHS, OP> >::apply(*this, proxy);
+ 
+         return *this;
+       }
+ 
+       // assign vector range or vector slice
+       template <typename T>
+       self_type &
+       operator = (const vector_base<T> & v1)
+       {
+         assert( ( (v1.size() == size()) || (size() == 0) )
+                 && bool("Incompatible vector sizes!"));
+ 
+         if (size() == 0)
+         {
+           size_ = v1.size();
+           if (size_ > 0)
+           {
+             internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
+             viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(v1));
+             pad();
+           }
+         }
+ 
+         viennacl::linalg::av(*this,
+                              v1, SCALARTYPE(1.0), 1, false, false);
+ 
+         return *this;
+       }
+ 
+       /** @brief Creates the vector from the supplied unit vector. */
+       self_type & operator = (unit_vector<SCALARTYPE> const & v)
+       {
+         assert( ( (v.size() == size()) || (size() == 0) )
+                 && bool("Incompatible vector sizes!"));
+ 
+         if (size() == 0)
+         {
+           size_ = v.size();
+           internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
+           if (size_ > 0)
+           {
+             viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), v.context());
+             clear();
+           }
+         }
+         else
+           viennacl::linalg::vector_assign(*this, SCALARTYPE(0));
+ 
+         if (size_ > 0)
+           this->operator()(v.index()) = SCALARTYPE(1);
+ 
+         return *this;
+       }
+ 
+       /** @brief Creates the vector from the supplied zero vector. */
+       self_type & operator = (zero_vector<SCALARTYPE> const & v)
+       {
+         assert( ( (v.size() == size()) || (size() == 0) )
+                 && bool("Incompatible vector sizes!"));
+ 
+         if (size() == 0)
+         {
+           size_ = v.size();
+           internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
+           if (size_ > 0)
+           {
+             viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), v.context());
+             clear();
+           }
+         }
+         else
+           viennacl::linalg::vector_assign(*this, SCALARTYPE(0));
+ 
+         return *this;
+       }
+ 
+       /** @brief Creates the vector from the supplied scalar vector. */
+       self_type & operator = (scalar_vector<SCALARTYPE> const & v)
+       {
+         assert( ( (v.size() == size()) || (size() == 0) )
+                 && bool("Incompatible vector sizes!"));
+ 
+         if (size() == 0)
+         {
+           size_ = v.size();
+           internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
+           if (size_ > 0)
+           {
+             viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), v.context());
+             pad();
+           }
+         }
+ 
+         if (size_ > 0)
+           viennacl::linalg::vector_assign(*this, v[0]);
+ 
+         return *this;
+       }
+ 
+ 
+ 
+       ///////////////////////////// Matrix Vector interaction start ///////////////////////////////////
+ 
+       //Note: The following operator overloads are defined in matrix_operations.hpp, compressed_matrix_operations.hpp and coordinate_matrix_operations.hpp
+       //This is certainly not the nicest approach and will most likely be changed in the future, but it works :-)
+ 
+       //matrix<>
+       /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a dense matrix.
+       *
+       * @param proxy An expression template proxy class
+       */
+       template <typename F>
+       self_type & operator=(const viennacl::vector_expression< const matrix_base<SCALARTYPE, F>, const vector_base<SCALARTYPE>, viennacl::op_prod> & proxy)
+       {
+         assert(viennacl::traits::size1(proxy.lhs()) == size() && bool("Size check failed for v1 = A * v2: size1(A) != size(v1)"));
+ 
+         // check for the special case x = A * x
+         if (viennacl::traits::handle(proxy.rhs()) == viennacl::traits::handle(*this))
+         {
+           viennacl::vector<SCALARTYPE> result(viennacl::traits::size1(proxy.lhs()));
+           viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+           *this = result;
+         }
+         else
+         {
+           viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
+         }
+         return *this;
+       }
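+ 
+       // Illustrative sketch: together with viennacl::linalg::prod() (declared
+       // in viennacl/linalg/prod.hpp), this overload enables expressions such as
+       //
+       //   v1 = viennacl::linalg::prod(A, v2);   // dense matrix-vector product
+       //
+       // The aliasing case v = prod(A, v) is detected via the memory handles
+       // and evaluated into a temporary, as implemented above.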
+ 
+ 
+       //transposed_matrix_proxy:
+       /** @brief Operator overload for v1 = trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
+       *
+       * @param proxy An expression template proxy class
+       */
+       template <typename F>
+       self_type & operator=(const vector_expression< const matrix_expression< const matrix_base<SCALARTYPE, F>, const matrix_base<SCALARTYPE, F>, op_trans >,
+                                                      const vector_base<SCALARTYPE>,
+                                                      op_prod> & proxy)
+       {
+         assert(viennacl::traits::size1(proxy.lhs()) == size() && bool("Size check failed in v1 = trans(A) * v2: size2(A) != size(v1)"));
+ 
+         // check for the special case x = trans(A) * x
+         if (viennacl::traits::handle(proxy.rhs()) == viennacl::traits::handle(*this))
+         {
+           viennacl::vector<SCALARTYPE> result(viennacl::traits::size1(proxy.lhs()));
+           viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+           *this = result;
+         }
+         else
+         {
+           viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
+         }
+         return *this;
+       }
+ 
+       ///////////////////////////// Matrix Vector interaction end ///////////////////////////////////
+ 
+ 
+       //read-write access to an element of the vector
+       /** @brief Read-write access to a single element of the vector
+       */
+       entry_proxy<SCALARTYPE> operator()(size_type index)
+       {
+         assert( (size() > 0)  && bool("Cannot apply operator() to vector of size zero!"));
+         assert( index < size() && bool("Index out of bounds!") );
+ 
+         return entry_proxy<SCALARTYPE>(start_ + stride_ * index, elements_);
+       }
+ 
+       /** @brief Read-write access to a single element of the vector
+       */
+       entry_proxy<SCALARTYPE> operator[](size_type index)
+       {
+         assert( (size() > 0)  && bool("Cannot apply operator[] to vector of size zero!"));
+         assert( index < size() && bool("Index out of bounds!") );
+ 
+         return entry_proxy<SCALARTYPE>(start_ + stride_ * index, elements_);
+       }
+ 
+ 
+       /** @brief Read access to a single element of the vector
+       */
+       const_entry_proxy<SCALARTYPE> operator()(size_type index) const
+       {
+         assert( (size() > 0)  && bool("Cannot apply operator() to vector of size zero!"));
+         assert( index < size() && bool("Index out of bounds!") );
+ 
+         return const_entry_proxy<SCALARTYPE>(start_ + stride_ * index, elements_);
+       }
+ 
+       /** @brief Read access to a single element of the vector
+       */
+       const_entry_proxy<SCALARTYPE> operator[](size_type index) const
+       {
+         assert( (size() > 0)  && bool("Cannot apply operator[] to vector of size zero!"));
+         assert( index < size() && bool("Index out of bounds!") );
+ 
+         return const_entry_proxy<SCALARTYPE>(start_ + stride_ * index, elements_);
+       }
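+ 
+       // Illustrative sketch: each element access triggers a host<->device
+       // transfer, so the proxies above are intended for occasional use only:
+       //
+       //   viennacl::vector<float> v(10);
+       //   v[0] = 1.0f;       // write a single entry via entry_proxy
+       //   float x = v(0);    // read it back (implicit conversion to float)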
+ 
+       //
+       // Operator overloads with implicit conversion (thus cannot be made global without introducing additional headache)
+       //
+       self_type & operator += (const self_type & vec)
+       {
+         assert(vec.size() == size() && bool("Incompatible vector sizes!"));
+ 
+         if (size() > 0)
+           viennacl::linalg::avbv(*this,
+                                   *this, SCALARTYPE(1.0), 1, false, false,
+                                   vec,   SCALARTYPE(1.0), 1, false, false);
+         return *this;
+       }
+ 
+       self_type & operator -= (const self_type & vec)
+       {
+         assert(vec.size() == size() && bool("Incompatible vector sizes!"));
+ 
+         if (size() > 0)
+           viennacl::linalg::avbv(*this,
+                                   *this, SCALARTYPE(1.0),  1, false, false,
+                                   vec,   SCALARTYPE(-1.0), 1, false, false);
+         return *this;
+       }
+ 
+       template <typename LHS, typename RHS, typename OP>
+       self_type & operator += (const vector_expression<const LHS, const RHS, OP> & proxy)
+       {
+         assert( (viennacl::traits::size(proxy) == size()) && bool("Incompatible vector sizes!"));
+         assert( (size() > 0) && bool("Vector not yet initialized!") );
+ 
+         linalg::detail::op_executor<self_type, op_inplace_add, vector_expression<const LHS, const RHS, OP> >::apply(*this, proxy);
+ 
+         return *this;
+       }
+ 
+       template <typename LHS, typename RHS, typename OP>
+       self_type & operator -= (const vector_expression<const LHS, const RHS, OP> & proxy)
+       {
+         assert( (viennacl::traits::size(proxy) == size()) && bool("Incompatible vector sizes!"));
+         assert( (size() > 0) && bool("Vector not yet initialized!") );
+ 
+         linalg::detail::op_executor<self_type, op_inplace_sub, vector_expression<const LHS, const RHS, OP> >::apply(*this, proxy);
+ 
+         return *this;
+       }
+ 
+       /** @brief Scales this vector by a CPU scalar value
+       */
+       self_type & operator *= (SCALARTYPE val)
+       {
+         if (size() > 0)
+           viennacl::linalg::av(*this,
+                                 *this, val, 1, false, false);
+         return *this;
+       }
+ 
+       /** @brief Divides this vector by a CPU scalar value
+       */
+       self_type & operator /= (SCALARTYPE val)
+       {
+         if (size() > 0)
+           viennacl::linalg::av(*this,
+                                *this, val, 1, true, false);
+         return *this;
+       }
+ 
+ 
+       /** @brief Scales the vector by a CPU scalar 'alpha' and returns an expression template
+       */
+       vector_expression< const self_type, const SCALARTYPE, op_mult>
+       operator * (SCALARTYPE value) const
+       {
+         return vector_expression< const self_type, const SCALARTYPE, op_mult>(*this, value);
+       }
+ 
+ 
+       /** @brief Divides the vector by a CPU scalar 'alpha' and returns an expression template
+       */
+       vector_expression< const self_type, const SCALARTYPE, op_div>
+       operator / (SCALARTYPE value) const
+       {
+         return vector_expression< const self_type, const SCALARTYPE, op_div>(*this, value);
+       }
+ 
+ 
+       /** @brief Sign flip for the vector. Emulated to be equivalent to -1.0 * vector */
+       vector_expression<const self_type, const SCALARTYPE, op_mult> operator-() const
+       {
+         return vector_expression<const self_type, const SCALARTYPE, op_mult>(*this, SCALARTYPE(-1.0));
+       }
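+ 
+       // Illustrative sketch: these operators only build expression templates;
+       // the actual kernel is launched when the expression is assigned, e.g.
+       //
+       //   v3 = 2.0f * v1 - v2 / 4.0f;   // typically fused into one kernel at '='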
+ 
+       //
+       //// iterators:
+       //
+ 
+       /** @brief Returns an iterator pointing to the beginning of the vector  (STL like)*/
+       iterator begin()
+       {
+         return iterator(*this, 0, start_, stride_);
+       }
+ 
+       /** @brief Returns an iterator pointing to the end of the vector (STL like)*/
+       iterator end()
+       {
+         return iterator(*this, size(), start_, stride_);
+       }
+ 
+       /** @brief Returns a const-iterator pointing to the beginning of the vector (STL like)*/
+       const_iterator begin() const
+       {
+         return const_iterator(*this, 0, start_, stride_);
+       }
+ 
+       /** @brief Returns a const-iterator pointing to the end of the vector (STL like)*/
+       const_iterator end() const
+       {
+         return const_iterator(*this, size(), start_, stride_);
+       }
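+ 
+       // Illustrative sketch: the STL-like iterators interoperate with the
+       // free copy() routines defined later in this file, e.g.
+       //
+       //   std::vector<float> host(v.size());
+       //   viennacl::copy(v.begin(), v.end(), host.begin());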
+ 
+       /** @brief Swaps the entries of the two vectors
+       */
+       self_type & swap(self_type & other)
+       {
+         viennacl::linalg::vector_swap(*this, other);
+         return *this;
+       }
+ 
+ 
+       /** @brief Returns the length of the vector (cf. std::vector)
+       */
+       size_type size() const { return size_; }
+ 
+       /** @brief Returns the internal length of the vector, which is given by size() plus the extra entries due to padding with zeros up to a multiple of 'ALIGNMENT'
+       */
+       size_type internal_size() const { return internal_size_; }
+ 
+       /** @brief Returns the offset within the buffer
+       */
+       size_type start() const { return start_; }
+ 
+       /** @brief Returns the stride within the buffer (in multiples of sizeof(SCALARTYPE))
+       */
+       size_type stride() const { return stride_; }
+ 
+ 
+       /** @brief Returns true if the size is zero */
+       bool empty() const { return size_ == 0; }
+ 
+       /** @brief Returns the memory handle. */
+       const handle_type & handle() const { return elements_; }
+ 
+       /** @brief Returns the memory handle. */
+       handle_type & handle() { return elements_; }
+ 
+       /** @brief Resets all entries to zero. Does not change the size of the vector.
+       */
+       void clear()
+       {
+         viennacl::linalg::vector_assign(*this, cpu_value_type(0.0), true);
+       }
+ 
+       viennacl::memory_types memory_domain() const
+       {
+         return elements_.get_active_handle_id();
+       }
+ 
+     protected:
+ 
+       void set_handle(viennacl::backend::mem_handle const & h)
+       {
+         elements_ = h;
+       }
+ 
+       /** @brief Swaps the handles of two vectors by swapping the OpenCL handles only, no data copy
+       */
+       self_type & fast_swap(self_type & other)
+       {
+         assert(this->size_ == other.size_ && bool("Vector size mismatch"));
+         this->elements_.swap(other.elements_);
+         return *this;
+       }
+ 
+       /** @brief Pads vectors with alignment > 1 with trailing zeros if the internal size is larger than the visible size */
+       void pad()
+       {
+         if (internal_size() != size())
+         {
+           std::vector<SCALARTYPE> pad(internal_size() - size());
+           viennacl::backend::memory_write(elements_, sizeof(SCALARTYPE) * size(), sizeof(SCALARTYPE) * pad.size(), &(pad[0]));
+         }
+       }
+ 
+       void switch_memory_context(viennacl::context new_ctx)
+       {
+         viennacl::backend::switch_memory_context<SCALARTYPE>(elements_, new_ctx);
+       }
+ 
+       //TODO: Think about implementing the following public member functions
+       //void insert_element(unsigned int i, SCALARTYPE val){}
+       //void erase_element(unsigned int i){}
+ 
+       //enlarge or reduce allocated memory and set unused memory to zero
+       /** @brief Resizes the allocated memory for the vector. Pads the memory to be a multiple of 'ALIGNMENT'
+       *
+       *  @param new_size  The new size of the vector
+       *  @param preserve  If true, old entries of the vector are preserved, otherwise they may be discarded.
+       */
+       void resize(size_type new_size, bool preserve = true)
+       {
+         resize_impl(new_size, viennacl::traits::context(*this), preserve);
+       }
+ 
+       /** @brief Resizes the allocated memory for the vector. Convenience function for setting an OpenCL context in case reallocation is needed
+       *
+       *  @param new_size  The new size of the vector
+       *  @param ctx       The context within which the new memory should be allocated
+       *  @param preserve  If true, old entries of the vector are preserved, otherwise they may be discarded.
+       */
+       void resize(size_type new_size, viennacl::context ctx, bool preserve = true)
+       {
+         resize_impl(new_size, ctx, preserve);
+       }
+ 
+     private:
+ 
+       void resize_impl(size_type new_size, viennacl::context ctx, bool preserve = true)
+       {
+         assert(new_size > 0 && bool("Positive size required when resizing vector!"));
+ 
+         if (new_size != size_)
+         {
+           vcl_size_t new_internal_size = viennacl::tools::align_to_multiple<vcl_size_t>(new_size, alignment);
+ 
+           std::vector<SCALARTYPE> temp(size_);
+           if (preserve && size_ > 0)
+             fast_copy(*this, temp);
+           temp.resize(new_size);  //drop all entries above new_size
+           temp.resize(new_internal_size); //enlarge to fit new internal size
+ 
+           if (new_internal_size != internal_size())
+           {
+             viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*new_internal_size, ctx, NULL);
+           }
+ 
+           fast_copy(temp, *this);
+           size_ = new_size;
+           internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
+           pad();
+         }
+ 
+       }
+ 
+       size_type       size_;
+       size_type       start_;
+       difference_type stride_;
+       size_type       internal_size_;
+       handle_type elements_;
+   }; //vector_base
+ 
+ 
+ 
+   // forward definition in forwards.h!
+   /** @brief A vector class representing a linear memory sequence on the GPU. Inspired by boost::numeric::ublas::vector
+   *
+   *  This is the basic vector type of ViennaCL. It is similar to std::vector and boost::numeric::ublas::vector and supports various linear algebra operations.
+   * By default, the internal length of the vector is padded to a multiple of 'ALIGNMENT' in order to speed up several GPU viennacl::ocl::kernels.
+   *
+   * @tparam SCALARTYPE  The floating point type, either 'float' or 'double'
+   * @tparam ALIGNMENT   The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+   */
+   template<class SCALARTYPE, unsigned int ALIGNMENT>
+   class vector : public vector_base<SCALARTYPE>
+   {
+     typedef vector<SCALARTYPE, ALIGNMENT>         self_type;
+     typedef vector_base<SCALARTYPE>               base_type;
+ 
+   public:
+     typedef typename base_type::size_type                  size_type;
+     typedef typename base_type::difference_type            difference_type;
+ 
+     /** @brief Default constructor in order to be compatible with various containers.
+     */
+     explicit vector() : base_type() { /* Note: One must not call ::init() here because the vector might have been created globally before the backend has become available */ }
+ 
+     /** @brief An explicit constructor for the vector, allocating the given amount of memory (plus a padding specified by 'ALIGNMENT')
+     *
+     * @param vec_size   The length (i.e. size) of the vector.
+     */
+     explicit vector(size_type vec_size) : base_type(vec_size) {}
+ 
+     explicit vector(size_type vec_size, viennacl::context ctx) : base_type(vec_size, ctx) {}
+ 
+     explicit vector(SCALARTYPE * ptr_to_mem, viennacl::memory_types mem_type, size_type vec_size, size_type start = 0, difference_type stride = 1)
+         : base_type(ptr_to_mem, mem_type, vec_size, start, stride) {}
+ 
+ #ifdef VIENNACL_WITH_OPENCL
+     /** @brief Create a vector from existing OpenCL memory
+     *
+     * Note: The provided memory must take a possible ALIGNMENT into account, i.e. existing_mem must be at least of size internal_size()!
+     * This is trivially the case with the default alignment, but should be considered when using vector<> with an alignment parameter not equal to 1.
+     *
+     * @param existing_mem   An OpenCL handle representing the memory
+     * @param vec_size       The size of the vector.
+     */
+     explicit vector(cl_mem existing_mem, size_type vec_size, size_type start = 0, difference_type stride = 1) : base_type(existing_mem, vec_size, start, stride) {}
+ 
+     /** @brief An explicit constructor for the vector, allocating the given amount of memory (plus a padding specified by 'ALIGNMENT') and the OpenCL context provided
+     *
+     * @param vec_size   The length (i.e. size) of the vector.
+     * @param ctx        The context
+     */
+     explicit vector(size_type vec_size, viennacl::ocl::context const & ctx) : base_type(vec_size, ctx) {}
+ #endif
+ 
+     template <typename LHS, typename RHS, typename OP>
+     vector(vector_expression<const LHS, const RHS, OP> const & proxy) : base_type(proxy) {}
+ 
+     vector(const base_type & v) : base_type(v.size(), viennacl::traits::context(v))
+     {
+       if (v.size() > 0)
+         base_type::operator=(v);
+     }
+ 
+     vector(const self_type & v) : base_type(v.size(), viennacl::traits::context(v))
+     {
+       if (v.size() > 0)
+         base_type::operator=(v);
+     }
+ 
+     /** @brief Creates the vector from the supplied unit vector. */
+     vector(unit_vector<SCALARTYPE> const & v) : base_type(v.size())
+     {
+       if (v.size() > 0)
+         this->operator()(v.index()) = SCALARTYPE(1);
+     }
+ 
+     /** @brief Creates the vector from the supplied zero vector. */
+     vector(zero_vector<SCALARTYPE> const & v) : base_type(v.size(), v.context())
+     {
+       if (v.size() > 0)
+         viennacl::linalg::vector_assign(*this, SCALARTYPE(0.0));
+     }
+ 
+     /** @brief Creates the vector from the supplied scalar vector. */
+     vector(scalar_vector<SCALARTYPE> const & v) : base_type(v.size(), v.context())
+     {
+       if (v.size() > 0)
+         viennacl::linalg::vector_assign(*this, v[0]);
+     }
+ 
+     // the following is used to circumvent an issue with Clang 3.0 when 'using base_type::operator=;' directly
+     template <typename T>
+     self_type & operator=(T const & other)
+     {
+       base_type::operator=(other);
+       return *this;
+     }
+ 
+     using base_type::operator+=;
+     using base_type::operator-=;
+ 
+     //enlarge or reduce allocated memory and set unused memory to zero
+     /** @brief Resizes the allocated memory for the vector. Pads the memory to be a multiple of 'ALIGNMENT'
+     *
+     *  @param new_size  The new size of the vector
+     *  @param preserve  If true, old entries of the vector are preserved, otherwise they may be discarded.
+     */
+     void resize(size_type new_size, bool preserve = true)
+     {
+       base_type::resize(new_size, preserve);
+     }
+ 
+     void resize(size_type new_size, viennacl::context ctx, bool preserve = true)
+     {
+       base_type::resize(new_size, ctx, preserve);
+     }
+ 
+     /** @brief Swaps the handles of two vectors by swapping the OpenCL handles only, no data copy
+     */
+     self_type & fast_swap(self_type & other)
+     {
+       base_type::fast_swap(other);
+       return *this;
+     }
+ 
+     void switch_memory_context(viennacl::context new_ctx)
+     {
+       base_type::switch_memory_context(new_ctx);
+     }
+ 
+   }; //vector
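+ 
+   // Illustrative usage sketch for viennacl::vector<> (assumes an initialized
+   // compute backend):
+   //
+   //   viennacl::vector<float> v(100);      // allocate 100 entries on the device
+   //   v.resize(200);                       // enlarge; old entries are preserved
+   //   viennacl::vector<float> w = v + v;   // construct directly from an expression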
+ 
+   /** @brief Tuple class holding pointers to multiple vectors. Mainly used as a temporary object returned from viennacl::tie(). */
+   template <typename ScalarT>
+   class vector_tuple
+   {
+     typedef vector_base<ScalarT>   VectorType;
+ 
+   public:
+       // 2 vectors
+ 
+       vector_tuple(VectorType const & v0, VectorType const & v1) : const_vectors_(2), non_const_vectors_()
+       {
+         const_vectors_[0] = &v0;
+         const_vectors_[1] = &v1;
+       }
+       vector_tuple(VectorType       & v0, VectorType       & v1) : const_vectors_(2), non_const_vectors_(2)
+       {
+         const_vectors_[0] = &v0; non_const_vectors_[0] = &v0;
+         const_vectors_[1] = &v1; non_const_vectors_[1] = &v1;
+       }
+ 
+       // 3 vectors
+ 
+       vector_tuple(VectorType const & v0, VectorType const & v1, VectorType const & v2) : const_vectors_(3), non_const_vectors_()
+       {
+         const_vectors_[0] = &v0;
+         const_vectors_[1] = &v1;
+         const_vectors_[2] = &v2;
+       }
+       vector_tuple(VectorType       & v0, VectorType       & v1, VectorType       & v2) : const_vectors_(3), non_const_vectors_(3)
+       {
+         const_vectors_[0] = &v0; non_const_vectors_[0] = &v0;
+         const_vectors_[1] = &v1; non_const_vectors_[1] = &v1;
+         const_vectors_[2] = &v2; non_const_vectors_[2] = &v2;
+       }
+ 
+       // 4 vectors
+ 
+       vector_tuple(VectorType const & v0, VectorType const & v1, VectorType const & v2, VectorType const & v3) : const_vectors_(4), non_const_vectors_()
+       {
+         const_vectors_[0] = &v0;
+         const_vectors_[1] = &v1;
+         const_vectors_[2] = &v2;
+         const_vectors_[3] = &v3;
+       }
+       vector_tuple(VectorType       & v0, VectorType       & v1, VectorType       & v2, VectorType       & v3) : const_vectors_(4), non_const_vectors_(4)
+       {
+         const_vectors_[0] = &v0; non_const_vectors_[0] = &v0;
+         const_vectors_[1] = &v1; non_const_vectors_[1] = &v1;
+         const_vectors_[2] = &v2; non_const_vectors_[2] = &v2;
+         const_vectors_[3] = &v3; non_const_vectors_[3] = &v3;
+       }
+ 
+       // add more overloads here
+ 
+       // generic interface:
+ 
+       vector_tuple(std::vector<VectorType const *> const & vecs) : const_vectors_(vecs.size()), non_const_vectors_()
+       {
+         for (vcl_size_t i=0; i<vecs.size(); ++i)
+           const_vectors_[i] = vecs[i];
+       }
+ 
+       vector_tuple(std::vector<VectorType *> const & vecs) : const_vectors_(vecs.size()), non_const_vectors_(vecs.size())
+       {
+         for (vcl_size_t i=0; i<vecs.size(); ++i)
+         {
+               const_vectors_[i] = vecs[i];
+           non_const_vectors_[i] = vecs[i];
+         }
+       }
+ 
+       vcl_size_t size()       const { return non_const_vectors_.size(); }
+       vcl_size_t const_size() const { return     const_vectors_.size(); }
+ 
+       VectorType       &       at(vcl_size_t i) const { return *(non_const_vectors_.at(i)); }
+       VectorType const & const_at(vcl_size_t i) const { return     *(const_vectors_.at(i)); }
+ 
+   private:
+     std::vector<VectorType const *>   const_vectors_;
+     std::vector<VectorType *>         non_const_vectors_;
+   };
+ 
+   // 2 args
+   template <typename ScalarT>
+   vector_tuple<ScalarT> tie(vector_base<ScalarT> const & v0, vector_base<ScalarT> const & v1) { return vector_tuple<ScalarT>(v0, v1); }
+ 
+   template <typename ScalarT>
+   vector_tuple<ScalarT> tie(vector_base<ScalarT>       & v0, vector_base<ScalarT>       & v1) { return vector_tuple<ScalarT>(v0, v1); }
+ 
+   // 3 args
+   template <typename ScalarT>
+   vector_tuple<ScalarT> tie(vector_base<ScalarT> const & v0, vector_base<ScalarT> const & v1, vector_base<ScalarT> const & v2) { return vector_tuple<ScalarT>(v0, v1, v2); }
+ 
+   template <typename ScalarT>
+   vector_tuple<ScalarT> tie(vector_base<ScalarT>       & v0, vector_base<ScalarT>       & v1, vector_base<ScalarT>       & v2) { return vector_tuple<ScalarT>(v0, v1, v2); }
+ 
+   // 4 args
+   template <typename ScalarT>
+   vector_tuple<ScalarT> tie(vector_base<ScalarT> const & v0, vector_base<ScalarT> const & v1, vector_base<ScalarT> const & v2, vector_base<ScalarT> const & v3)
+   {
+     return vector_tuple<ScalarT>(v0, v1, v2, v3);
+   }
+ 
+   template <typename ScalarT>
+   vector_tuple<ScalarT> tie(vector_base<ScalarT>       & v0, vector_base<ScalarT>       & v1, vector_base<ScalarT>       & v2, vector_base<ScalarT>       & v3)
+   {
+     return vector_tuple<ScalarT>(v0, v1, v2, v3);
+   }
+ 
+   // 5 args
+   template <typename ScalarT>
+   vector_tuple<ScalarT> tie(vector_base<ScalarT> const & v0,
+                             vector_base<ScalarT> const & v1,
+                             vector_base<ScalarT> const & v2,
+                             vector_base<ScalarT> const & v3,
+                             vector_base<ScalarT> const & v4)
+   {
+     typedef vector_base<ScalarT> const *       VectorPointerType;
+     std::vector<VectorPointerType> vec(5);
+     vec[0] = &v0;
+     vec[1] = &v1;
+     vec[2] = &v2;
+     vec[3] = &v3;
+     vec[4] = &v4;
+     return vector_tuple<ScalarT>(vec);
+   }
+ 
+   template <typename ScalarT>
+   vector_tuple<ScalarT> tie(vector_base<ScalarT> & v0,
+                             vector_base<ScalarT> & v1,
+                             vector_base<ScalarT> & v2,
+                             vector_base<ScalarT> & v3,
+                             vector_base<ScalarT> & v4)
+   {
+     typedef vector_base<ScalarT> *       VectorPointerType;
+     std::vector<VectorPointerType> vec(5);
+     vec[0] = &v0;
+     vec[1] = &v1;
+     vec[2] = &v2;
+     vec[3] = &v3;
+     vec[4] = &v4;
+     return vector_tuple<ScalarT>(vec);
+   }
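+ 
+   // Illustrative sketch: tie() groups several vectors for routines operating
+   // on many vectors at once; at() hands back the underlying vectors:
+   //
+   //   viennacl::vector_tuple<float> tuple = viennacl::tie(v0, v1, v2);
+   //   tuple.at(0) += v1;   // valid, since the tuple was built from non-const refs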
+ 
+   // TODO: Add more arguments to tie() here. Maybe use some preprocessor magic to accomplish this.
+ 
+   //
+   //////////////////// Copy from GPU to CPU //////////////////////////////////
+   //
+ 
+ 
+   /** @brief STL-like transfer of a GPU vector to the CPU. The cpu type is assumed to reside in a linear piece of memory, such as e.g. for std::vector.
+   *
+   * This method is faster than the plain copy() function, because entries are
+   * directly written to the cpu vector, starting with &(*cpu.begin()). However,
+   * keep in mind that the cpu type MUST represent a linear piece of
+   * memory, otherwise you will run into undefined behavior.
+   *
+   * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
+   * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
+   * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
+   */
+   template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+   void fast_copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
+                   const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
+                   CPU_ITERATOR cpu_begin )
+   {
+     if (gpu_begin != gpu_end)
+     {
+       if (gpu_begin.stride() == 1)
+       {
+         viennacl::backend::memory_read(gpu_begin.handle(),
+                                       sizeof(SCALARTYPE)*gpu_begin.offset(),
+                                       sizeof(SCALARTYPE)*gpu_begin.stride() * (gpu_end - gpu_begin),
+                                       &(*cpu_begin));
+       }
+       else
+       {
+         vcl_size_t gpu_size = (gpu_end - gpu_begin);
+         std::vector<SCALARTYPE> temp_buffer(gpu_begin.stride() * gpu_size);
+         viennacl::backend::memory_read(gpu_begin.handle(), sizeof(SCALARTYPE)*gpu_begin.offset(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+ 
+         for (vcl_size_t i=0; i<gpu_size; ++i)
+         {
+           (&(*cpu_begin))[i] = temp_buffer[i * gpu_begin.stride()];
+         }
+       }
+     }
+   }
+ 
+   /** @brief Transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+   *
+   * @param gpu_vec    A gpu vector.
+   * @param cpu_vec    The cpu vector. Type requirements: Output iterator pointing to entries linear in memory can be obtained via member function .begin()
+   */
+   template <typename NumericT, typename CPUVECTOR>
+   void fast_copy(vector_base<NumericT> const & gpu_vec, CPUVECTOR & cpu_vec )
+   {
+     viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+   }
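+ 
+   // Illustrative sketch: fast_copy() requires the host container to occupy
+   // linear memory, as std::vector does:
+   //
+   //   viennacl::vector<float> v(100);
+   //   std::vector<float> host(100);
+   //   viennacl::fast_copy(v, host);   // device -> host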
+ 
+ 
+   /** @brief Asynchronous version of fast_copy(), copying data from device to host. The host iterator cpu_begin needs to reside in a linear piece of memory, such as e.g. for std::vector.
+   *
+   * This method allows for overlapping data transfer with host computation and returns immediately if the gpu vector has a unit-stride.
+   * In order to wait for the transfer to complete, use viennacl::backend::finish().
+   * Note that data pointed to by cpu_begin must not be modified prior to completion of the transfer.
+   *
+   * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
+   * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
+   * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
+   */
+   template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+   void async_copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
+                   const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
+                   CPU_ITERATOR cpu_begin )
+   {
+     if (gpu_begin != gpu_end)
+     {
+       if (gpu_begin.stride() == 1)
+       {
+         viennacl::backend::memory_read(gpu_begin.handle(),
+                                        sizeof(SCALARTYPE)*gpu_begin.offset(),
+                                        sizeof(SCALARTYPE)*gpu_begin.stride() * (gpu_end - gpu_begin),
+                                        &(*cpu_begin),
+                                        true);
+       }
+       else // no async copy possible, so fall-back to fast_copy
+         fast_copy(gpu_begin, gpu_end, cpu_begin);
+     }
+   }
+ 
+   /** @brief Asynchronous transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::async_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+   *
+   * @param gpu_vec    A gpu vector.
+   * @param cpu_vec    The cpu vector. Type requirements: Output iterator pointing to entries linear in memory can be obtained via member function .begin()
+   */
+   template <typename NumericT, typename CPUVECTOR>
+   void async_copy(vector_base<NumericT> const & gpu_vec, CPUVECTOR & cpu_vec )
+   {
+     viennacl::async_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+   }
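+ 
+   // Illustrative sketch: overlapping a device -> host transfer with host work:
+   //
+   //   viennacl::async_copy(v, host);   // returns immediately for unit stride
+   //   // ... unrelated host computation ...
+   //   viennacl::backend::finish();     // block until the transfer has completed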
+ 
+ 
+   /** @brief STL-like transfer for the entries of a GPU vector to the CPU. The cpu type does not need to lie in a linear piece of memory.
+   *
+   * @param gpu_begin  GPU constant iterator pointing to the beginning of the gpu vector (STL-like)
+   * @param gpu_end    GPU constant iterator pointing to the end of the vector (STL-like)
+   * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
+   */
+   template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+   void copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
+             const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
+             CPU_ITERATOR cpu_begin )
+   {
+     assert(gpu_end - gpu_begin >= 0 && bool("Iterators incompatible"));
+     if (gpu_end - gpu_begin != 0)
+     {
+       std::vector<SCALARTYPE> temp_buffer(gpu_end - gpu_begin);
+       fast_copy(gpu_begin, gpu_end, temp_buffer.begin());
+ 
+       //now copy entries to cpu_vec:
+       std::copy(temp_buffer.begin(), temp_buffer.end(), cpu_begin);
+     }
+   }
+ 
+   /** @brief STL-like transfer for the entries of a GPU vector to the CPU. The cpu type does not need to lie in a linear piece of memory.
+   *
+   * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
+   * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
+   * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
+   */
+   template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+   void copy(const vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
+             const vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
+             CPU_ITERATOR cpu_begin )
+   {
+     viennacl::copy(const_vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_begin),
+                     const_vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_end),
+                     cpu_begin);
+   }
+ 
+   /** @brief Transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+   *
+   * @param gpu_vec    A gpu vector
+   * @param cpu_vec    The cpu vector. Type requirements: Output iterator can be obtained via member function .begin()
+   */
+   template <typename NumericT, typename CPUVECTOR>
+   void copy(vector_base<NumericT> const & gpu_vec, CPUVECTOR & cpu_vec )
+   {
+     viennacl::copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+   }
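+ 
+   // Illustrative sketch: in contrast to fast_copy(), the generic copy() also
+   // accepts host containers without linear storage:
+   //
+   //   std::list<float> host_list(v.size());
+   //   viennacl::copy(v.begin(), v.end(), host_list.begin());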
+ 
+ 
+ 
+   #ifdef VIENNACL_WITH_EIGEN
+   template <unsigned int ALIGNMENT>
+   void copy(vector<float, ALIGNMENT> const & gpu_vec,
+             Eigen::VectorXf & eigen_vec)
+   {
+     viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), &(eigen_vec[0]));
+   }
+ 
+   template <unsigned int ALIGNMENT>
+   void copy(vector<double, ALIGNMENT> const & gpu_vec,
+             Eigen::VectorXd & eigen_vec)
+   {
+     viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), &(eigen_vec[0]));
+   }
+   #endif
+ 
+ 
+   //
+   //////////////////// Copy from CPU to GPU //////////////////////////////////
+   //
+ 
+   /** @brief STL-like transfer of a CPU vector to the GPU. The cpu type is assumed to reside in a linear piece of memory, such as e.g. for std::vector.
+   *
+   * This method is faster than the plain copy() function, because entries are
+   * directly read from the cpu vector, starting with &(*cpu.begin()). However,
+   * keep in mind that the cpu type MUST represent a linear piece of
+   * memory, otherwise you will run into undefined behavior.
+   *
+   * @param cpu_begin  CPU iterator pointing to the beginning of the cpu vector (STL-like)
+   * @param cpu_end    CPU iterator pointing to the end of the vector (STL-like)
+   * @param gpu_begin  Output iterator for the gpu vector. The gpu iterator must be incrementable (cpu_end - cpu_begin) times, otherwise the result is undefined.
+   */
+   template <typename CPU_ITERATOR, typename SCALARTYPE, unsigned int ALIGNMENT>
+   void fast_copy(CPU_ITERATOR const & cpu_begin,
+                   CPU_ITERATOR const & cpu_end,
+                   vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
+   {
+     if (cpu_end - cpu_begin > 0)
+     {
+       if (gpu_begin.stride() == 1)
+       {
+         viennacl::backend::memory_write(gpu_begin.handle(),
+                                         sizeof(SCALARTYPE)*gpu_begin.offset(),
+                                         sizeof(SCALARTYPE)*gpu_begin.stride() * (cpu_end - cpu_begin), &(*cpu_begin));
+       }
+       else //writing to slice:
+       {
+         vcl_size_t cpu_size = (cpu_end - cpu_begin);
+         std::vector<SCALARTYPE> temp_buffer(gpu_begin.stride() * cpu_size);
+ 
+         viennacl::backend::memory_read(gpu_begin.handle(), sizeof(SCALARTYPE)*gpu_begin.offset(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+ 
+         for (vcl_size_t i=0; i<cpu_size; ++i)
+           temp_buffer[i * gpu_begin.stride()] = (&(*cpu_begin))[i];
+ 
+         viennacl::backend::memory_write(gpu_begin.handle(), sizeof(SCALARTYPE)*gpu_begin.offset(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+       }
+     }
+   }
+ 
+ 
+   /** @brief Transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+   *
+   * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
+   * @param gpu_vec    The gpu vector.
+   */
+   template <typename CPUVECTOR, typename NumericT>
+   void fast_copy(const CPUVECTOR & cpu_vec, vector_base<NumericT> & gpu_vec)
+   {
+     viennacl::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+   }
+ 
+   /** @brief Asynchronous version of fast_copy(), copying data from host to device. The host iterator cpu_begin needs to reside in a linear piece of memory, such as e.g. for std::vector.
+   *
+   * This method allows for overlapping data transfer with host computation and returns immediately if the gpu vector has a unit-stride.
+   * In order to wait for the transfer to complete, use viennacl::backend::finish().
+   * Note that data pointed to by cpu_begin must not be modified prior to completion of the transfer.
+   *
+   * @param cpu_begin  CPU iterator pointing to the beginning of the cpu vector (STL-like)
+   * @param cpu_end    CPU iterator pointing to the end of the vector (STL-like)
+   * @param gpu_begin  Output iterator for the gpu vector. The gpu iterator must be incrementable (cpu_end - cpu_begin) times, otherwise the result is undefined.
+   */
+   template <typename CPU_ITERATOR, typename SCALARTYPE, unsigned int ALIGNMENT>
+   void async_copy(CPU_ITERATOR const & cpu_begin,
+                   CPU_ITERATOR const & cpu_end,
+                   vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
+   {
+     if (cpu_end - cpu_begin > 0)
+     {
+       if (gpu_begin.stride() == 1)
+       {
+         viennacl::backend::memory_write(gpu_begin.handle(),
+                                         sizeof(SCALARTYPE)*gpu_begin.offset(),
+                                         sizeof(SCALARTYPE)*gpu_begin.stride() * (cpu_end - cpu_begin), &(*cpu_begin),
+                                         true);
+       }
+       else // fallback to blocking copy. There's nothing we can do to prevent this
+         fast_copy(cpu_begin, cpu_end, gpu_begin);
+     }
+   }
+ 
+ 
+   /** @brief Asynchronous transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::async_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+   *
+   * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
+   * @param gpu_vec    The gpu vector.
+   */
+   template <typename CPUVECTOR, typename NumericT>
+   void async_copy(const CPUVECTOR & cpu_vec, vector_base<NumericT> & gpu_vec)
+   {
+     viennacl::async_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+   }
+ 
+   //from cpu to gpu. The cpu vector does not need to occupy a linear memory segment, but must not be larger than the memory allocated on the GPU.
+   /** @brief STL-like transfer for the entries of a CPU vector to the GPU. The cpu type does not need to lie in a linear piece of memory.
+   *
+   * @param cpu_begin  CPU iterator pointing to the beginning of the cpu vector (STL-like)
+   * @param cpu_end    CPU iterator pointing to the end of the cpu vector (STL-like)
+   * @param gpu_begin  Output iterator for the gpu vector. The gpu vector must be at least as long as the cpu vector!
+   */
+   template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+   void copy(CPU_ITERATOR const & cpu_begin,
+             CPU_ITERATOR const & cpu_end,
+             vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
+   {
+     assert(cpu_end - cpu_begin >= 0 && bool("Iterators incompatible"));
+     if (cpu_begin != cpu_end)
+     {
+       //we require that the size of the gpu_vector is larger or equal to the cpu-size
+       std::vector<SCALARTYPE> temp_buffer(cpu_end - cpu_begin);
+       std::copy(cpu_begin, cpu_end, temp_buffer.begin());
+       viennacl::fast_copy(temp_buffer.begin(), temp_buffer.end(), gpu_begin);
+     }
+   }
+ 
+   // for things like copy(std_vec.begin(), std_vec.end(), vcl_vec.begin() + 1);
+ 
+   /** @brief Transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+   *
+   * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
+   * @param gpu_vec    The gpu vector.
+   */
+   template <typename CPUVECTOR, typename T>
+   void copy(const CPUVECTOR & cpu_vec, vector_base<T> & gpu_vec)
+   {
+     viennacl::copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+   }
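+ 
+   // Illustrative sketch: host -> device transfer via the generic copy():
+   //
+   //   std::vector<float> host(100, 1.0f);
+   //   viennacl::vector<float> v(100);
+   //   viennacl::copy(host, v);   // same as copy(host.begin(), host.end(), v.begin())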
+ 
+ 
+   #ifdef VIENNACL_WITH_EIGEN
+   template <unsigned int ALIGNMENT>
+   void copy(Eigen::VectorXf const & eigen_vec,
+             vector<float, ALIGNMENT> & gpu_vec)
+   {
+     std::vector<float> entries(eigen_vec.size());
+     for (vcl_size_t i = 0; i<entries.size(); ++i)
+       entries[i] = eigen_vec(i);
+     viennacl::fast_copy(entries.begin(), entries.end(), gpu_vec.begin());
+   }
+ 
+   template <unsigned int ALIGNMENT>
+   void copy(Eigen::VectorXd const & eigen_vec,
+             vector<double, ALIGNMENT> & gpu_vec)
+   {
+     std::vector<double> entries(eigen_vec.size());
+     for (vcl_size_t i = 0; i<entries.size(); ++i)
+       entries[i] = eigen_vec(i);
+     viennacl::fast_copy(entries.begin(), entries.end(), gpu_vec.begin());
+   }
+   #endif
+ 
+ 
+ 
+   //
+   //////////////////// Copy from GPU to GPU //////////////////////////////////
+   //
+   /** @brief Copy (parts of a) GPU vector to another GPU vector
+   *
+   * @param gpu_src_begin    GPU iterator pointing to the beginning of the gpu vector (STL-like)
+   * @param gpu_src_end      GPU iterator pointing to the end of the vector (STL-like)
+   * @param gpu_dest_begin   Output iterator for the gpu vector. The gpu_dest vector must be at least as long as the gpu_src vector!
+   */
+   template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
+   void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
+             const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
+             vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin)
+   {
+     assert(gpu_src_end - gpu_src_begin >= 0);
+     assert(gpu_src_begin.stride() == 1 && bool("ViennaCL ERROR: copy() for GPU->GPU not implemented for slices! Use operator= instead for the moment."));
+ 
+     if (gpu_src_begin.stride() == 1 && gpu_dest_begin.stride() == 1)
+     {
+       if (gpu_src_begin != gpu_src_end)
+         viennacl::backend::memory_copy(gpu_src_begin.handle(), gpu_dest_begin.handle(),
+                                         sizeof(SCALARTYPE) * gpu_src_begin.offset(),
+                                         sizeof(SCALARTYPE) * gpu_dest_begin.offset(),
+                                         sizeof(SCALARTYPE) * (gpu_src_end.offset() - gpu_src_begin.offset()));
+     }
+     else
+     {
+       assert( false && bool("not implemented yet"));
+     }
+   }
+ 
+   /** @brief Copy (parts of a) GPU vector to another GPU vector
+   *
+   * @param gpu_src_begin   GPU iterator pointing to the beginning of the gpu vector (STL-like)
+   * @param gpu_src_end     GPU iterator pointing to the end of the vector (STL-like)
+   * @param gpu_dest_begin  Output iterator for the gpu vector. The gpu_dest vector must be at least as long as the gpu_src vector!
+   */
+   template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
+   void copy(vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
+             vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
+             vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin)
+   {
+     viennacl::copy(static_cast<const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> >(gpu_src_begin),
+                     static_cast<const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> >(gpu_src_end),
+                     gpu_dest_begin);
+   }
+ 
+   /** @brief Transfer from a ViennaCL vector to another ViennaCL vector. Convenience wrapper for viennacl::copy(gpu_src_vec.begin(), gpu_src_vec.end(), gpu_dest_vec.begin());
+   *
+   * @param gpu_src_vec     A gpu vector
+   * @param gpu_dest_vec    The destination gpu vector. Type requirements: Output iterator can be obtained via member function .begin()
+   */
+   template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
+   void copy(vector<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_vec,
+             vector<SCALARTYPE, ALIGNMENT_DEST> & gpu_dest_vec )
+   {
+     viennacl::copy(gpu_src_vec.begin(), gpu_src_vec.end(), gpu_dest_vec.begin());
+   }
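+ 
+   // Illustrative sketch: device -> device transfer without a host round-trip:
+   //
+   //   viennacl::vector<float> src(100), dst(100);
+   //   viennacl::copy(src, dst);   // backed by a single memory_copy() for unit stride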
+ 
+ 
+ 
+ 
+ 
+ 
+   //global functions for handling vectors:
+   /** @brief Output stream. Output format is ublas compatible.
+   * @param os   STL output stream
+   * @param val  The vector that should be printed
+   */
+   template <typename T>
+   std::ostream & operator<<(std::ostream & os, vector_base<T> const & val)
+   {
+     std::vector<T> tmp(val.size());
+     viennacl::copy(val.begin(), val.end(), tmp.begin());
+     os << "[" << val.size() << "](";
+     for (typename std::vector<T>::size_type i=0; i<val.size(); ++i)
+     {
+       if (i > 0)
+         os << ",";
+       os << tmp[i];
+     }
+     os << ")";
+     return os;
+   }
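+ 
+   // Illustrative sketch: a vector of size 3 holding the entries (1, 2, 3)
+   // prints in the ublas-compatible format
+   //
+   //   [3](1,2,3)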
+ 
+   template <typename LHS, typename RHS, typename OP>
+   std::ostream & operator<<(std::ostream & os, vector_expression<LHS, RHS, OP> const & proxy)
+   {
+     typedef typename viennacl::result_of::cpu_value_type<typename LHS::value_type>::type ScalarType;
+     viennacl::vector<ScalarType> result = proxy;
+     os << result;
+     return os;
+   }
+ 
+   /** @brief Swaps the contents of two vectors, data is copied
+   *
+   * @param vec1   The first vector
+   * @param vec2   The second vector
+   */
+   template <typename T>
+   void swap(vector_base<T> & vec1, vector_base<T> & vec2)
+   {
+     viennacl::linalg::vector_swap(vec1, vec2);
+   }
+ 
+   /** @brief Swaps the content of two vectors by swapping OpenCL handles only, NO data is copied
+   *
+   * @param v1   The first vector
+   * @param v2   The second vector
+   */
+   template <typename SCALARTYPE, unsigned int ALIGNMENT>
+   vector<SCALARTYPE, ALIGNMENT> & fast_swap(vector<SCALARTYPE, ALIGNMENT> & v1,
+                                             vector<SCALARTYPE, ALIGNMENT> & v2)
+   {
+     return v1.fast_swap(v2);
+   }
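+ 
+   // Illustrative sketch: swap() launches an element-wise swap kernel, whereas
+   // fast_swap() merely exchanges the memory handles (sizes must match):
+   //
+   //   viennacl::swap(v1, v2);        // O(n) data movement
+   //   viennacl::fast_swap(v1, v2);   // O(1), handles only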
+ 
+ 
+ 
+ 
+ 
+   //
+   //
+   ////////// operations /////////////////////////////////////////////////////////////////////////////////
+   //
+   //
+ 
+ 
+   //
+   // operator *=
+   //
+ 
+   /** @brief Scales the vector v1 by a GPU scalar value
+   */
+   template <typename T, typename S1>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 vector_base<T> &
+                               >::type
+   operator *= (vector_base<T> & v1, S1 const & gpu_val)
+   {
+     if (v1.size() > 0)
+       viennacl::linalg::av(v1,
+                            v1, gpu_val, 1, false, (viennacl::is_flip_sign_scalar<S1>::value ? true : false));
+     return v1;
+   }
+ 
+ 
+   //
+   // operator /=
+   //
+ 
+ 
+   /** @brief Divides the vector v1 by a GPU scalar value
+   */
+   template <typename T, typename S1>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 vector_base<T> &
+                               >::type
+   operator /= (vector_base<T> & v1, S1 const & gpu_val)
+   {
+     if (v1.size() > 0)
+       viennacl::linalg::av(v1,
+                            v1, gpu_val, 1, true, (viennacl::is_flip_sign_scalar<S1>::value ? true : false));
+     return v1;
+   }
+ 
+ 
+   //
+   // operator +
+   //
+ 
+ 
+   /** @brief Operator overload for the addition of two vector expressions.
+   *
+   * @param proxy1  Left hand side vector expression
+   * @param proxy2  Right hand side vector expression
+   */
+   template <typename LHS1, typename RHS1, typename OP1,
+             typename LHS2, typename RHS2, typename OP2>
+   vector_expression< const vector_expression< LHS1, RHS1, OP1>,
+                      const vector_expression< LHS2, RHS2, OP2>,
+                      viennacl::op_add>
+   operator + (vector_expression<LHS1, RHS1, OP1> const & proxy1,
+               vector_expression<LHS2, RHS2, OP2> const & proxy2)
+   {
+     assert(proxy1.size() == proxy2.size() && bool("Incompatible vector sizes!"));
+     return   vector_expression< const vector_expression<LHS1, RHS1, OP1>,
+                                 const vector_expression<LHS2, RHS2, OP2>,
+                                 viennacl::op_add>(proxy1, proxy2);
+   }
+ 
+   /** @brief Operator overload for the addition of a vector expression with a vector or another vector expression. This is the default implementation for all cases that are too complex in order to be covered within a single kernel, hence a temporary vector is created.
+   *
+   * @param proxy   Left hand side vector expression
+   * @param vec     Right hand side vector (also -range and -slice is allowed)
+   */
+   template <typename LHS, typename RHS, typename OP, typename T>
+   vector_expression< const vector_expression<LHS, RHS, OP>,
+                      const vector_base<T>,
+                      viennacl::op_add>
+   operator + (vector_expression<LHS, RHS, OP> const & proxy,
+               vector_base<T> const & vec)
+   {
+     assert(proxy.size() == vec.size() && bool("Incompatible vector sizes!"));
+     return vector_expression< const vector_expression<LHS, RHS, OP>,
+                               const vector_base<T>,
+                               viennacl::op_add>(proxy, vec);
+   }
+ 
+   /** @brief Operator overload for the addition of a vector with a vector expression. This is the default implementation for all cases that are too complex in order to be covered within a single kernel, hence a temporary vector is created.
+   *
+   * @param vec     Left hand side vector (also -range and -slice is allowed)
+   * @param proxy   Right hand side vector expression
+   */
+   template <typename T, typename LHS, typename RHS, typename OP>
+   vector_expression< const vector_base<T>,
+                      const vector_expression<LHS, RHS, OP>,
+                      viennacl::op_add>
+   operator + (vector_base<T> const & vec,
+               vector_expression<LHS, RHS, OP> const & proxy)
+   {
+     assert(proxy.size() == vec.size() && bool("Incompatible vector sizes!"));
+     return vector_expression< const vector_base<T>,
+                               const vector_expression<LHS, RHS, OP>,
+                               viennacl::op_add>(vec, proxy);
+   }
+ 
+   /** @brief Returns an expression template object for adding up two vectors, i.e. v1 + v2
+   */
+   template <typename T>
+   vector_expression< const vector_base<T>, const vector_base<T>, op_add>
+   operator + (const vector_base<T> & v1, const vector_base<T> & v2)
+   {
+     return vector_expression< const vector_base<T>, const vector_base<T>, op_add>(v1, v2);
+   }
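+ 
+   // Illustrative sketch: the additive overloads above return expression
+   // templates, so chained sums are evaluated only upon assignment:
+   //
+   //   v4 = v1 + v2 + v3;   // evaluated at the '=', possibly via a temporary
+   //                        // for nestings too complex for a single kernel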
+ 
+ 
+ 
+   //
+   // operator -
+   //
+ 
+   /** @brief Operator overload for the subtraction of two vector expressions.
+   *
+   * @param proxy1  Left hand side vector expression
+   * @param proxy2  Right hand side vector expression
+   */
+   template <typename LHS1, typename RHS1, typename OP1,
+             typename LHS2, typename RHS2, typename OP2>
+   vector_expression< const vector_expression< LHS1, RHS1, OP1>,
+                      const vector_expression< LHS2, RHS2, OP2>,
+                      viennacl::op_sub>
+   operator - (vector_expression<LHS1, RHS1, OP1> const & proxy1,
+               vector_expression<LHS2, RHS2, OP2> const & proxy2)
+   {
+     assert(proxy1.size() == proxy2.size() && bool("Incompatible vector sizes!"));
+     return   vector_expression< const vector_expression<LHS1, RHS1, OP1>,
+                                 const vector_expression<LHS2, RHS2, OP2>,
+                                 viennacl::op_sub>(proxy1, proxy2);
+   }
+ 
+ 
+   /** @brief Operator overload for the subtraction of a vector expression with a vector or another vector expression. This is the default implementation for all cases that are too complex in order to be covered within a single kernel, hence a temporary vector is created.
+   *
+   * @param proxy   Left hand side vector expression
+   * @param vec     Right hand side vector (also -range and -slice is allowed)
+   */
+   template <typename LHS, typename RHS, typename OP, typename T>
+   vector_expression< const vector_expression<LHS, RHS, OP>,
+                      const vector_base<T>,
+                      viennacl::op_sub>
+   operator - (vector_expression<LHS, RHS, OP> const & proxy,
+               vector_base<T> const & vec)
+   {
+     assert(proxy.size() == vec.size() && bool("Incompatible vector sizes!"));
+     return vector_expression< const vector_expression<LHS, RHS, OP>,
+                               const vector_base<T>,
+                               viennacl::op_sub>(proxy, vec);
+   }
+ 
+   /** @brief Operator overload for the subtraction of a vector expression from a vector. This is the default implementation for all cases that are too complex in order to be covered within a single kernel, hence a temporary vector is created.
+   *
+   * @param vec     Left hand side vector (also -range and -slice is allowed)
+   * @param proxy   Right hand side vector expression
+   */
+   template <typename T, typename LHS, typename RHS, typename OP>
+   vector_expression< const vector_base<T>,
+                      const vector_expression<LHS, RHS, OP>,
+                      viennacl::op_sub>
+   operator - (vector_base<T> const & vec,
+               vector_expression<LHS, RHS, OP> const & proxy)
+   {
+     assert(proxy.size() == vec.size() && bool("Incompatible vector sizes!"));
+     return vector_expression< const vector_base<T>,
+                               const vector_expression<LHS, RHS, OP>,
+                               viennacl::op_sub>(vec, proxy);
+   }
+ 
+   /** @brief Returns an expression template object for subtracting two vectors, i.e. v1 - v2
+   */
+   template <typename T>
+   vector_expression< const vector_base<T>, const vector_base<T>, op_sub>
+   operator - (const vector_base<T> & v1, const vector_base<T> & v2)
+   {
+     return vector_expression< const vector_base<T>, const vector_base<T>, op_sub>(v1, v2);
+   }
+ 
+ 
+   //
+   // operator *
+   //
+ 
+ 
+   /** @brief Operator overload for the expression alpha * v1, where alpha is a host scalar (float or double) and v1 is a ViennaCL vector.
+   *
+   * @param value   The host scalar (float or double)
+   * @param vec     A ViennaCL vector
+   */
+   template <typename S1, typename T>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 vector_expression< const vector_base<T>, const S1, op_mult> >::type
+   operator * (S1 const & value, vector_base<T> const & vec)
+   {
+     return vector_expression< const vector_base<T>, const S1, op_mult>(vec, value);
+   }
+ 
+   /** @brief Operator overload for the expression alpha * v1, where alpha is a char
+   *
+   * @param value   The host scalar (char)
+   * @param vec     A ViennaCL vector
+   */
+   template <typename T>
+   vector_expression< const vector_base<T>, const T, op_mult>
+   operator * (char value, vector_base<T> const & vec)
+   {
+     return vector_expression< const vector_base<T>, const T, op_mult>(vec, value);
+   }
+ 
+   /** @brief Operator overload for the expression alpha * v1, where alpha is a short
+   *
+   * @param value   The host scalar (short)
+   * @param vec     A ViennaCL vector
+   */
+   template <typename T>
+   vector_expression< const vector_base<T>, const T, op_mult>
+   operator * (short value, vector_base<T> const & vec)
+   {
+     return vector_expression< const vector_base<T>, const T, op_mult>(vec, value);
+   }
+ 
+   /** @brief Operator overload for the expression alpha * v1, where alpha is an int
+   *
+   * @param value   The host scalar (int)
+   * @param vec     A ViennaCL vector
+   */
+   template <typename T>
+   vector_expression< const vector_base<T>, const T, op_mult>
+   operator * (int value, vector_base<T> const & vec)
+   {
+     return vector_expression< const vector_base<T>, const T, op_mult>(vec, value);
+   }
+ 
+   /** @brief Operator overload for the expression alpha * v1, where alpha is a long
+   *
+   * @param value   The host scalar (long)
+   * @param vec     A ViennaCL vector
+   */
+   template <typename T>
+   vector_expression< const vector_base<T>, const T, op_mult>
+   operator * (long value, vector_base<T> const & vec)
+   {
+     return vector_expression< const vector_base<T>, const T, op_mult>(vec, value);
+   }
+ 
+ 
+ 
+ 
+   /** @brief Operator overload for the expression alpha * v1, where alpha is a scalar expression and v1 is a ViennaCL vector.
+   *
+   * @param expr    The scalar expression
+   * @param vec     A ViennaCL vector
+   */
+   template <typename LHS, typename RHS, typename OP, typename T>
+   vector_expression< const vector_base<T>, const scalar_expression<LHS, RHS, OP>, op_mult>
+   operator * (scalar_expression<LHS, RHS, OP> const & expr, vector_base<T> const & vec)
+   {
+     return vector_expression< const vector_base<T>, const scalar_expression<LHS, RHS, OP>, op_mult>(vec, expr);
+   }
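+ 
+   //   Usage sketch (illustrative), assuming viennacl::vector<float> x, y, v, w:
+   //     w = viennacl::linalg::inner_prod(x, y) * v;   // scalar_expression * vector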
+ 
+   /** @brief Scales the vector by a scalar 'alpha' and returns an expression template
+   */
+   template <typename T, typename S1>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 vector_expression< const vector_base<T>, const S1, op_mult> >::type
+   operator * (vector_base<T> const & vec, S1 const & value)
+   {
+     return vector_expression< const vector_base<T>, const S1, op_mult>(vec, value);
+   }
+ 
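+   /** @brief Scales the vector by a host scalar 'alpha' of the vector's value type T and returns an expression template
+   */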
+   template <typename T>
+   vector_expression< const vector_base<T>, const T, op_mult>
+   operator * (vector_base<T> const & vec, T const & value)
+   {
+     return vector_expression< const vector_base<T>, const T, op_mult>(vec, value);
+   }
+ 
+   /** @brief Operator overload for the multiplication of a vector expression with a scalar from the right, e.g. (beta * vec1) * alpha. Here, beta * vec1 is wrapped into a vector_expression and then multiplied by alpha from the right.
+   *
+   * @param proxy   Left hand side vector expression
+   * @param val     Right hand side scalar
+   */
+   template <typename LHS, typename RHS, typename OP, typename S1>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_mult>  >::type
+   operator * (vector_expression< LHS, RHS, OP> const & proxy,
+               S1 const & val)
+   {
+     return viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_mult>(proxy, val);
+   }
+ 
+   /** @brief Operator overload for the multiplication of a vector expression with a scalar from the left, e.g. alpha * (beta * vec1). Here, beta * vec1 is wrapped into a vector_expression and then multiplied by alpha from the left.
+   *
+   * @param val     Left hand side scalar
+   * @param proxy   Right hand side vector expression
+   */
+   template <typename S1, typename LHS, typename RHS, typename OP>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_mult>  >::type
+   operator * (S1 const & val,
+               vector_expression<LHS, RHS, OP> const & proxy)
+   {
+     return viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_mult>(proxy, val);
+   }
+ 
+   //
+   // operator /
+   //
+ 
+   /** @brief Operator overload for the division of a vector expression by a scalar from the right, e.g. (beta * vec1) / alpha. Here, beta * vec1 is wrapped into a vector_expression and then divided by alpha.
+   *
+   * @param proxy   Left hand side vector expression
+   * @param val     Right hand side scalar
+   */
+   template <typename S1, typename LHS, typename RHS, typename OP>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_div>  >::type
+   operator / (vector_expression< LHS, RHS, OP> const & proxy,
+               S1 const & val)
+   {
+     return viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_div>(proxy, val);
+   }
+ 
+ 
+   /** @brief Returns an expression template for scaling the vector by a scalar 'alpha', where 'alpha' may be a host scalar or a ViennaCL (GPU) scalar
+   */
+   template <typename T, typename S1>
+   typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                 vector_expression< const vector_base<T>, const S1, op_div> >::type
+   operator / (vector_base<T> const & v1, S1 const & s1)
+   {
+     return vector_expression<const vector_base<T>, const S1, op_div>(v1, s1);
+   }
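+ 
+   //   Usage sketch (illustrative), assuming viennacl::vector<float> v1, v2 and a
+   //   nonzero float alpha:
+   //     v2 = v1 / alpha;          // vector / scalar
+   //     v2 = (v1 + v2) / alpha;   // vector_expression / scalar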
+ 
+ 
+ 
+   //
+   // Specify available operations:
+   //
+ 
+   /** \cond */
+ 
+   namespace linalg
+   {
+     namespace detail
+     {
+       // x = y
+       template <typename T>
+       struct op_executor<vector_base<T>, op_assign, vector_base<T> >
+       {
+         static void apply(vector_base<T> & lhs, vector_base<T> const & rhs)
+         {
+           viennacl::linalg::av(lhs, rhs, T(1), 1, false, false);
+         }
+       };
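+ 
+       // Note (an illustrative reading of the call sites in this file):
+       //   viennacl::linalg::av(x, y, alpha, len_alpha, reciprocal, flip_sign)
+       // assigns x = alpha * y; 'reciprocal' selects division by alpha (used for
+       // op_div) and 'flip_sign' negates alpha (used for subtraction). The call
+       // above therefore reduces to a plain copy, x = 1 * y.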
+ 
+       // x = inner_prod(z, {y0, y1, ...})
+       template <typename T>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const vector_base<T>, const vector_tuple<T>, op_inner_prod> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_tuple<T>, op_inner_prod> const & rhs)
+         {
+           viennacl::linalg::inner_prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+         }
+       };
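+ 
+       // Usage sketch (illustrative), assuming viennacl::tie() is used to build
+       // the vector_tuple and x holds one entry per right hand side vector:
+       //   x = viennacl::linalg::inner_prod(z, viennacl::tie(y0, y1));
+       // computes <z, y0> and <z, y1> in a single pass over z.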
+ 
+       // x += y
+       template <typename T>
+       struct op_executor<vector_base<T>, op_inplace_add, vector_base<T> >
+       {
+         static void apply(vector_base<T> & lhs, vector_base<T> const & rhs)
+         {
+           viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, rhs, T(1), 1, false, false);
+         }
+       };
+ 
+       // x -= y
+       template <typename T>
+       struct op_executor<vector_base<T>, op_inplace_sub, vector_base<T> >
+       {
+         static void apply(vector_base<T> & lhs, vector_base<T> const & rhs)
+         {
+           viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, rhs, T(1), 1, false, true);
+         }
+       };
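+ 
+       // Note (an illustrative reading of the call sites):
+       //   viennacl::linalg::avbv(x, y, a, len_a, reciprocal_a, flip_a,
+       //                             z, b, len_b, reciprocal_b, flip_b)
+       // assigns x = (+/-)a * y (+/-)b * z in one kernel; the 'reciprocal' flags
+       // select division by the scalar and the 'flip' flags select negation, so
+       // the x -= y case above is x = 1*x + (-1)*y.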
+ 
+       ///////////// x  OP  y * alpha ////////////////////////
+ 
+ 
+       // x = alpha * y
+       template <typename T, typename ScalarType>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const vector_base<T>, const ScalarType, op_mult> >
+       {
+         // generic case: ScalarType is a scalar expression
+         template <typename LHS, typename RHS, typename OP>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar_expression<LHS, RHS, OP>, op_mult> const & proxy)
+         {
+           T alpha = proxy.rhs();
+           viennacl::linalg::av(lhs, proxy.lhs(), alpha, 1, false, false);
+         }
+ 
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar<T>, op_mult> const & proxy)
+         {
+           viennacl::linalg::av(lhs, proxy.lhs(), proxy.rhs(), 1, false, false);
+         }
+ 
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const T, op_mult> const & proxy)
+         {
+           viennacl::linalg::av(lhs, proxy.lhs(), proxy.rhs(), 1, false, false);
+         }
+       };
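+ 
+       // Note (illustrative): overload resolution prefers the exact scalar<T> and
+       // plain T apply() overloads; only a genuine scalar_expression falls through
+       // to the generic template, where it is first evaluated into the host value
+       // 'alpha' (typically a device-to-host read) before av() is invoked.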
+ 
+       // x += alpha * y
+       template <typename T, typename ScalarType>
+       struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const vector_base<T>, const ScalarType, op_mult> >
+       {
+         // generic case: ScalarType is a scalar expression
+         template <typename LHS, typename RHS, typename OP>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar_expression<LHS, RHS, OP>, op_mult> const & proxy)
+         {
+           T alpha = proxy.rhs();
+           viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), alpha, 1, false, false);
+         }
+ 
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar<T>, op_mult> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, false);
+         }
+ 
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const T, op_mult> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, false);
+         }
+       };
+ 
+       // x -= alpha * y
+       template <typename T, typename ScalarType>
+       struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const vector_base<T>, const ScalarType, op_mult> >
+       {
+         // generic case: ScalarType is a scalar expression
+         template <typename LHS, typename RHS, typename OP>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar_expression<LHS, RHS, OP>, op_mult> const & proxy)
+         {
+           T alpha = proxy.rhs();
+           viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), alpha, 1, false, true);
+         }
+ 
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar<T>, op_mult> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, true);
+         }
+ 
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const T, op_mult> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, true);
+         }
+       };
+ 
+ 
+       ///////////// x  OP  vec_expr * alpha ////////////////////////
+ 
+       // x = alpha * vec_expr
+       template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
+         {
+           vector<T> temp(proxy.lhs());
+           lhs = temp * proxy.rhs();
+         }
+       };
+ 
+       // x += alpha * vec_expr
+       template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
+         {
+           vector<T> temp(proxy.lhs());
+           lhs += temp * proxy.rhs();
+         }
+       };
+ 
+       // x -= alpha * vec_expr
+       template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
+         {
+           vector<T> temp(proxy.lhs());
+           lhs -= temp * proxy.rhs();
+         }
+       };
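+ 
+       // Note (illustrative): a nested left operand is materialized first, so
+       //   x = (y + z) * alpha;
+       // evaluates temp = y + z and then dispatches x = temp * alpha, whereas
+       //   x = alpha * y + alpha * z;
+       // maps onto a single fused avbv() call without a temporary.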
+ 
+ 
+       ///////////// x  OP  y / alpha ////////////////////////
+ 
+       // x = y / alpha
+       template <typename T, typename ScalarType>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const vector_base<T>, const ScalarType, op_div> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const ScalarType, op_div> const & proxy)
+         {
+           viennacl::linalg::av(lhs, proxy.lhs(), proxy.rhs(), 1, true, false);
+         }
+       };
+ 
+       // x += y / alpha
+       template <typename T, typename ScalarType>
+       struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const vector_base<T>, const ScalarType, op_div> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const ScalarType, op_div> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, true, false);
+         }
+       };
+ 
+       // x -= y / alpha
+       template <typename T, typename ScalarType>
+       struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const vector_base<T>, const ScalarType, op_div> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const ScalarType, op_div> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, true, true);
+         }
+       };
+ 
+ 
+       ///////////// x  OP  vec_expr / alpha ////////////////////////
+ 
+       // x = vec_expr / alpha
+       template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+         {
+           vector<T> temp(proxy.lhs());
+           lhs = temp / proxy.rhs();
+         }
+       };
+ 
+       // x += vec_expr / alpha
+       template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+         {
+           vector<T> temp(proxy.lhs());
+           lhs += temp / proxy.rhs();
+         }
+       };
+ 
+       // x -= vec_expr / alpha
+       template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+       struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+       {
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+         {
+           vector<T> temp(proxy.lhs());
+           lhs -= temp / proxy.rhs();
+         }
+       };
+ 
+ 
+ 
+       // generic x = vec_expr1 + vec_expr2:
+       template <typename T, typename LHS, typename RHS>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const RHS, op_add> >
+       {
+         // generic x = vec_expr1 + vec_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_add> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             vector_base<T> temp(proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_add, RHS>::apply(temp, proxy.rhs());
+             lhs = temp;
+           }
+           else
+           {
+             op_executor<vector_base<T>, op_assign, LHS>::apply(lhs, proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x = y + z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_add> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x = alpha * y + z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                   const vector_base<T>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x = y / alpha + z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                   const vector_base<T>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x = y + beta * z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x = y + z / beta
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x = alpha * y + beta * z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x = alpha * y + z / beta
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x = y / alpha + beta * z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x = y / alpha + z / beta
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+       };
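+ 
+       // Usage sketch (illustrative): each specialized apply() above folds a
+       // two-term sum into one avbv() call, so for viennacl::vector<float> x, y, z:
+       //   x = 2.0f * y + z / 3.0f;   // matches the 'x = alpha * y + z / beta' overload
+       // runs as a single fused operation; only shapes without a specialized
+       // overload take the generic two-step path.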
+ 
+ 
+       // generic x += vec_expr1 + vec_expr2:
+       template <typename T, typename LHS, typename RHS>
+       struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const LHS, const RHS, op_add> >
+       {
+         // generic x += vec_expr1 + vec_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_add> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             vector_base<T> temp(proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_add, RHS>::apply(temp, proxy.rhs());
+             lhs += temp;
+           }
+           else
+           {
+             op_executor<vector_base<T>, op_inplace_add, LHS>::apply(lhs, proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x += y + z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x += alpha * y + z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                   const vector_base<T>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x += y / alpha + z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                   const vector_base<T>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x += y + beta * z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x += y + z / beta
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x += alpha * y + beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x += alpha * y + z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x += y / alpha + beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x += y / alpha + z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+       };
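+ 
+       // Note (an illustrative reading of the call sites): avbv_v() is the
+       // accumulating counterpart of avbv(), i.e. x += (+/-)a * y (+/-)b * z,
+       // so in-place updates also stay a single kernel per two-term expression.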
+ 
+ 
+ 
+       // generic x -= vec_expr1 + vec_expr2:
+       template <typename T, typename LHS, typename RHS>
+       struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const LHS, const RHS, op_add> >
+       {
+         // generic x -= vec_expr1 + vec_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_add> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             vector_base<T> temp(proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_add, RHS>::apply(temp, proxy.rhs());
+             lhs -= temp;
+           }
+           else
+           {
+             op_executor<vector_base<T>, op_inplace_sub, LHS>::apply(lhs, proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x -= y + z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x -= alpha * y + z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                   const vector_base<T>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x -= y / alpha + z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                   const vector_base<T>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x -= y + beta * z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x -= y + z / beta
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x -= alpha * y + beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x -= alpha * y + z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x -= y / alpha + beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x -= y / alpha + z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                   op_add> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+       };
+ 
+ 
+ 
+       ///////////////////////
+ 
+ 
+ 
+       // generic x = vec_expr1 - vec_expr2:
+       template <typename T, typename LHS, typename RHS>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const RHS, op_sub> >
+       {
+         // generic x = vec_expr1 - vec_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_sub> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             vector_base<T> temp(proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());
+             lhs = temp;
+           }
+           else
+           {
+             op_executor<vector_base<T>, op_assign, LHS>::apply(lhs, proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x = y - z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x = alpha * y - z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                   const vector_base<T>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x = y / alpha - z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                   const vector_base<T>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x = y - beta * z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x = y - z / beta
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs(), T(1), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x = alpha * y - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x = alpha * y - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x = y / alpha - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x = y / alpha - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv(lhs,
+                                  proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                  proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+       };
+ 
+ 
+       // generic x += vec_expr1 - vec_expr2:
+       template <typename T, typename LHS, typename RHS>
+       struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const LHS, const RHS, op_sub> >
+       {
+         // generic x += vec_expr1 - vec_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_sub> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             vector_base<T> temp(proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());
+             lhs += temp;
+           }
+           else
+           {
+             op_executor<vector_base<T>, op_inplace_add, LHS>::apply(lhs, proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x += y - z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x += alpha * y - z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                   const vector_base<T>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x += y / alpha - z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                   const vector_base<T>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs(), T(1), 1, false, true);
+         }
+ 
+         // x += y - beta * z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x += y - z / beta
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x += alpha * y - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x += alpha * y - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+ 
+         // x += y / alpha - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+         }
+ 
+         // x += y / alpha - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+         }
+       };
+ 
+ 
+ 
+       // generic x -= vec_expr1 - vec_expr2:
+       template <typename T, typename LHS, typename RHS>
+       struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const LHS, const RHS, op_sub> >
+       {
+         // generic x -= vec_expr1 - vec_expr2:
+         template <typename LHS1, typename RHS1>
+         static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_sub> const & proxy)
+         {
+           bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+           bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+ 
+           if (op_aliasing_lhs || op_aliasing_rhs)
+           {
+             vector_base<T> temp(proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());
+             lhs -= temp;
+           }
+           else
+           {
+             op_executor<vector_base<T>, op_inplace_sub, LHS>::apply(lhs, proxy.lhs());
+             op_executor<vector_base<T>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
+           }
+         }
+ 
+         // x -= y - z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x -= alpha * y - z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                   const vector_base<T>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x -= y / alpha - z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                   const vector_base<T>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs(), T(1), 1, false, false);
+         }
+ 
+         // x -= y - beta * z
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x -= y - z / beta
+         template <typename ScalarType>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs(), T(1), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x -= alpha * y - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x -= alpha * y - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+ 
+         // x -= y / alpha - beta * z
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+         }
+ 
+         // x -= y / alpha - z / beta
+         template <typename ScalarType1, typename ScalarType2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                   const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                   op_sub> const & proxy)
+         {
+           viennacl::linalg::avbv_v(lhs,
+                                    proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                    proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+         }
+       };
+ 
+ 
+ 
+ 
+       //////////////////// Element-wise operations ////////////////////////////////////////
+ 
+       // generic x = vec_expr1 .* vec_expr2:
+       template <typename T, typename LHS, typename RHS, typename OP>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const RHS, op_element_binary<OP> > >
+       {
+         // x = y .* z  or  x = y ./ z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+         {
+           viennacl::linalg::element_op(lhs, proxy);
+         }
+ 
+         // x = y .* vec_expr  or  x = y ./ vec_expr
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_expression<const LHS2, const RHS2, OP2>, op_element_binary<OP> > const & proxy)
+         {
+           vector<T> temp(proxy.rhs());
+           viennacl::linalg::element_op(lhs, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(proxy.lhs(), temp));
+         }
+ 
+         // x = vec_expr .* z  or  x = vec_expr ./ z
+         template <typename LHS1, typename RHS1, typename OP1>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+         {
+           vector<T> temp(proxy.lhs());
+           viennacl::linalg::element_op(lhs, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp, proxy.rhs()));
+         }
+ 
+         // x = vec_expr .* vec_expr  or  x = vec_expr ./ vec_expr
+         template <typename LHS1, typename RHS1, typename OP1,
+                   typename LHS2, typename RHS2, typename OP2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>,
+                                                                   const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                   op_element_binary<OP> > const & proxy)
+         {
+           vector<T> temp1(proxy.lhs());
+           vector<T> temp2(proxy.rhs());
+           viennacl::linalg::element_op(lhs, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp1, temp2));
+         }
+       };
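
The executor above is what backs MATLAB-style element-wise products and
quotients on vectors. A minimal sketch, assuming the element_prod/element_div
free functions from viennacl::linalg:

    #include "viennacl/vector.hpp"

    int main()
    {
      viennacl::vector<float> x(100), y(100), z(100);
      x = viennacl::linalg::element_prod(y, z);  // x = y .* z, no temporary
      x = viennacl::linalg::element_div(y, z);   // x = y ./ z
      return 0;
    }

When either operand is itself a vector expression, the branches above first
materialize it into a temporary before launching the element_op kernel.
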
+ 
+       // generic x += vec_expr1 .* vec_expr2:
+       template <typename T, typename LHS, typename RHS, typename OP>
+       struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const LHS, const RHS, op_element_binary<OP> > >
+       {
+         // x += y .* z  or  x += y ./ z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+         {
+           viennacl::vector<T> temp(proxy);
+           lhs += temp;
+         }
+ 
+         // x += y .* vec_expr  or  x += y ./ vec_expr
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_expression<const LHS2, const RHS2, OP2>,  op_element_binary<OP> > const & proxy)
+         {
+           vector<T> temp(proxy.rhs());
+           vector<T> temp2(temp.size());
+           viennacl::linalg::element_op(temp2, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(proxy.lhs(), temp));
+           lhs += temp2;
+         }
+ 
+         // x += vec_expr .* z  or  x += vec_expr ./ z
+         template <typename LHS1, typename RHS1, typename OP1>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+         {
+           vector<T> temp(proxy.lhs());
+           vector<T> temp2(temp.size());
+           viennacl::linalg::element_op(temp2, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp, proxy.rhs()));
+           lhs += temp2;
+         }
+ 
+         // x += vec_expr .* vec_expr  or  x += vec_expr ./ vec_expr
+         template <typename LHS1, typename RHS1, typename OP1,
+                   typename LHS2, typename RHS2, typename OP2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>,
+                                                                   const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                   op_element_binary<OP> > const & proxy)
+         {
+           vector<T> temp1(proxy.lhs());
+           vector<T> temp2(proxy.rhs());
+           vector<T> temp3(temp1.size());
+           viennacl::linalg::element_op(temp3, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp1, temp2));
+           lhs += temp3;
+         }
+       };
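
Note the cost asymmetry with plain assignment: for the in-place variants the
element-wise result is always materialized into at least one temporary vector
and then folded in by a second kernel, so assigning directly into the target
is the cheaper form where it applies.
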
+ 
+       // generic x -= vec_expr1 .* vec_expr2:
+       template <typename T, typename LHS, typename RHS, typename OP>
+       struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const LHS, const RHS, op_element_binary<OP> > >
+       {
+ 
+         // x -= y .* z  or  x -= y ./ z
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+         {
+           viennacl::vector<T> temp(proxy);
+           lhs -= temp;
+         }
+ 
+         // x -= y .* vec_expr  or  x -= y ./ vec_expr
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_expression<const LHS2, const RHS2, OP2>, op_element_binary<OP> > const & proxy)
+         {
+           vector<T> temp(proxy.rhs());
+           vector<T> temp2(temp.size());
+           viennacl::linalg::element_op(temp2, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(proxy.lhs(), temp));
+           lhs -= temp2;
+         }
+ 
+         // x -= vec_expr .* z  or  x -= vec_expr ./ z
+         template <typename LHS1, typename RHS1, typename OP1>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+         {
+           vector<T> temp(proxy.lhs());
+           vector<T> temp2(temp.size());
+           viennacl::linalg::element_op(temp2, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp, proxy.rhs()));
+           lhs -= temp2;
+         }
+ 
+         // x -= vec_expr .* vec_expr  or  x -= vec_expr ./ vec_expr
+         template <typename LHS1, typename RHS1, typename OP1,
+                   typename LHS2, typename RHS2, typename OP2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>,
+                                                                   const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                   op_element_binary<OP> > const & proxy)
+         {
+           vector<T> temp1(proxy.lhs());
+           vector<T> temp2(proxy.rhs());
+           vector<T> temp3(temp1.size());
+           viennacl::linalg::element_op(temp3, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp1, temp2));
+           lhs -= temp3;
+         }
+       };
+ 
+       //////////////// unary expressions
+ 
+       template <typename T, typename LHS, typename RHS, typename OP>
+       struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const RHS, op_element_unary<OP> > >
+       {
+         // x = OP(y)
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
+         {
+           viennacl::linalg::element_op(lhs, proxy);
+         }
+ 
+         // x = OP(vec_expr)
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                   const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                   op_element_unary<OP> > const & proxy)
+         {
+           vector<T> temp(proxy.rhs());
+           viennacl::linalg::element_op(lhs, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> >(temp, temp));
+         }
+       };
+ 
+       template <typename T, typename LHS, typename RHS, typename OP>
+       struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const LHS, const RHS, op_element_unary<OP> > >
+       {
+         // x += OP(y)
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
+         {
+           vector<T> temp(proxy);
+           lhs += temp;
+         }
+ 
+         // x += OP(vec_expr)
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                   const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                   op_element_unary<OP> > const & proxy)
+         {
+           vector<T> temp(proxy.rhs());
+           viennacl::linalg::element_op(temp, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> >(temp, temp)); // inplace operation is safe here
+           lhs += temp;
+         }
+       };
+ 
+       template <typename T, typename LHS, typename RHS, typename OP>
+       struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const LHS, const RHS, op_element_unary<OP> > >
+       {
+         // x -= OP(y)
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
+         {
+           vector<T> temp(proxy);
+           lhs -= temp;
+         }
+ 
+         // x -= OP(vec_expr)
+         template <typename LHS2, typename RHS2, typename OP2>
+         static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                   const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                   op_element_unary<OP> > const & proxy)
+         {
+           vector<T> temp(proxy.rhs());
+           viennacl::linalg::element_op(temp, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> >(temp, temp)); // inplace operation is safe here
+           lhs -= temp;
+         }
+       };
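
The unary executors follow the same pattern. With the element-wise math
functions (element_sqrt, element_exp, and friends; assumed here to live in
viennacl::linalg alongside element_prod), this enables, as a sketch:

    #include "viennacl/vector.hpp"

    int main()
    {
      viennacl::vector<float> x(100), y(100);
      x  = viennacl::linalg::element_sqrt(y);  // direct: one element_op kernel
      x -= viennacl::linalg::element_exp(y);   // via a temporary, then one subtraction
      return 0;
    }
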
+ 
+     } // namespace detail
+ 
+   } // namespace linalg
+ 
+   /** \endcond */
+ 
+ } // namespace viennacl
+ 
+ #endif
++>>>>>>> upstream/1.5.1
diff --cc viennacl/vector_proxy.hpp
index f6d7107,a7f2cfa..26d4847
--- a/viennacl/vector_proxy.hpp
+++ b/viennacl/vector_proxy.hpp
@@@ -29,12 -30,16 +30,20 @@@
  
  namespace viennacl
  {
- 
+   /** @brief Class for representing non-strided subvectors of a bigger vector x.
+     *
+     * In MATLAB notation, this could, for example, refer to the subvector x(3:8) of a vector x.
+     */
    template <typename VectorType>
-   class vector_range
+   class vector_range : public vector_base<typename VectorType::cpu_value_type>
    {
        typedef vector_range<VectorType>             self_type;
++<<<<<<< HEAD
 +    
++=======
+       typedef vector_base<typename VectorType::cpu_value_type> base_type;
+ 
++>>>>>>> upstream/1.5.1
      public:
        typedef typename VectorType::value_type      value_type;
        typedef range::size_type                     size_type;
@@@ -43,260 -48,25 +52,262 @@@
        typedef const value_type &                   const_reference;
        typedef typename VectorType::const_iterator  const_iterator;
        typedef typename VectorType::iterator        iterator;
++<<<<<<< HEAD
 +      
 +
 +      typedef typename viennacl::result_of::cpu_value_type<value_type>::type    cpu_value_type;
 +      
 +      static const int alignment = VectorType::alignment;
 +      
 +      vector_range(VectorType & v, 
 +                   range const & entry_range) : v_(v), entry_range_(entry_range) {}
 +                   
 +      size_type start() const { return entry_range_.start(); }
 +      size_type size() const { return entry_range_.size(); }
 +
 +      
 +      /** @brief Operator overload for v1 = A * v2, where v1 and v2 are vector ranges and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename MatrixType>
 +      typename viennacl::enable_if< viennacl::is_matrix<MatrixType>::value, self_type &>::type
 +      operator=(const vector_expression< const MatrixType,
 +                                         const self_type,
 +                                         op_prod> & proxy);
 +      
 +      
 +      
 +
 +      template <typename LHS, typename RHS, typename OP>
 +      self_type & operator=(const vector_expression< LHS,
 +                                                     RHS,
 +                                                     OP > & proxy) 
 +      {
 +        VectorType temp = proxy;
 +        *this = temp;
 +        return *this;
 +      }      
 +
 +
 +      /** @brief Convenience function for assigning a vector directly to a vector range of suitable size */
 +      self_type & operator=(const VectorType & v) 
 +      {
 +        assert(size() == v.size() && "Vector range and vector size mismatch!");
 +        
 +        if (size() > 0)
 +        {
 +          cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                           v.handle().get(),     // src buffer
 +                                           v_.handle().get(),    //dest buffer
 +                                           0,                    // src offset
 +                                           sizeof(cpu_value_type) * start(), //dest offset
 +                                           sizeof(cpu_value_type) * size(),  //number of bytes to be copied
 +                                           0, NULL, NULL);
 +                                           
 +          VIENNACL_ERR_CHECK(err);
 +        }
 +        
 +        return *this;
 +      }      
 +
 +      /** @brief Convenience function for assigning a vector range directly to another vector range of suitable size */
 +      self_type & operator=(const self_type & v) 
 +      {
 +        assert(size() == v.size() && "Sizes of vector ranges don't match!");
 +        
 +        if (size() > 0)
 +        {
 +          cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                           v.get().handle().get(),   // src buffer
 +                                           v_.handle().get(),        //dest buffer
 +                                           sizeof(cpu_value_type) * v.start(),   // src offset
 +                                           sizeof(cpu_value_type) * start(),     //dest offset
 +                                           sizeof(cpu_value_type) * size(),      //number of bytes to be copied
 +                                           0, NULL, NULL);
 +                                           
 +          VIENNACL_ERR_CHECK(err);
 +        }
 +        
 +        return *this;
 +      }      
 +
 +      ///////////// operator +=
 +
 +      self_type & operator += (VectorType const & other)
 +      {
 +        viennacl::linalg::inplace_add(*this, other);
 +        return *this;
 +      }
 +
 +      self_type & operator += (self_type const & other)
 +      {
 +        viennacl::linalg::inplace_add(*this, other);
 +        return *this;
 +      }
 +      
 +      ///////////// operator -=
 +
 +      self_type & operator -= (VectorType const & other)
 +      {
 +        viennacl::linalg::inplace_sub(*this, other);
 +        return *this;
 +      }
 +
 +      self_type & operator -= (self_type const & other)
 +      {
 +        viennacl::linalg::inplace_sub(*this, other);
 +        return *this;
 +      }
 +
 +      ///////////// operator *=
 +      self_type & operator *= (cpu_value_type const & cpu_val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, cpu_val);
 +        return *this;
 +      }
 +      
 +      self_type & operator *= (value_type const & gpu_val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, gpu_val);
 +        return *this;
 +      }
 +
 +      ///////////// operator /=
 +      self_type & operator /= (cpu_value_type const & cpu_val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, cpu_value_type(1) / cpu_val);
 +        return *this;
 +      }
 +      
 +      self_type & operator /= (value_type const & gpu_val)
 +      {
 +        viennacl::linalg::inplace_divide(*this, gpu_val);
 +        return *this;
 +      }
 +      
 +      
 +      ///////////// Direct manipulation via operator() and operator[]
 +      //read-write access to an element of the vector
 +      /** @brief Read-write access to a single element of the vector
 +      */
 +      entry_proxy<cpu_value_type> operator()(size_type index)
 +      {
 +        return entry_proxy<cpu_value_type>(index + start(), v_.get());
 +      }
 +
 +      /** @brief Read-write access to a single element of the vector
 +      */
 +      entry_proxy<cpu_value_type> operator[](size_type index)
 +      {
 +        return entry_proxy<cpu_value_type>(index + start(), v_.get());
 +      }
 +
 +
 +      /** @brief Read access to a single element of the vector
 +      */
 +      scalar<cpu_value_type> operator()(size_type index) const
 +      {
 +        scalar<cpu_value_type> tmp;
 +        cl_int err;
 +        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), v_.get(), tmp.handle(), sizeof(cpu_value_type)*(index + start()), 0, sizeof(cpu_value_type), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        return tmp;
 +      }
 +      
 +      /** @brief Read access to a single element of the vector
 +      */
 +      scalar<cpu_value_type> operator[](size_type index) const
 +      {
 +        return operator()(index);
 +      }
 +      
 +      ///////////// iterators:
 +      /** @brief Returns an iterator pointing to the beginning of the vector (STL-like) */
 +      iterator begin()
 +      {
 +        return iterator(v_, 0, start());
 +      }
 +
 +      /** @brief Returns an iterator pointing to the end of the vector (STL-like) */
 +      iterator end()
 +      {
 +        return iterator(v_, size(), start());
 +      }
 +
 +      /** @brief Returns a const-iterator pointing to the beginning of the vector (STL-like) */
 +      const_iterator begin() const
 +      {
 +        return const_iterator(v_, 0, start());
 +      }
 +
 +      /** @brief Returns a const-iterator pointing to the end of the vector (STL-like) */
 +      const_iterator end() const
 +      {
 +        return const_iterator(v_, size(), start());
 +      }
 +      
 +      ///////////// Misc
 +
 +      VectorType & get() { return v_; }
 +      const VectorType & get() const { return v_; }
 +
 +    private:
 +      VectorType & v_;
 +      range entry_range_;
++=======
+ 
+       typedef typename VectorType::cpu_value_type    cpu_value_type;
+ 
+       static const int alignment = VectorType::alignment;
+ 
+       vector_range(VectorType & v, range const & entry_range)
+        : base_type(v.handle(), entry_range.size(), v.start() + v.stride() * entry_range.start(), v.stride()) {}
+ 
+ 
+       using base_type::operator=;
+ 
++>>>>>>> upstream/1.5.1
    };
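
Both merge sides admit the same construction syntax; viennacl::range takes
(start, stop) with stop excluded, matching the range(...) arithmetic in the
project() overloads further below. A minimal sketch, assuming a working
OpenCL context:

    #include "viennacl/vector.hpp"
    #include "viennacl/vector_proxy.hpp"

    int main()
    {
      viennacl::vector<float> v(10);
      viennacl::range r(3, 8);                                   // entries 3..7
      viennacl::vector_range<viennacl::vector<float> > vr(v, r);
      vr *= 2.0f;                                                // scales only v[3..7]
      return 0;
    }
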
 +  
 +  
 +  //implement operator= for vector:
 +  
 +  template <typename SCALARTYPE, unsigned int ALIGNMENT>
 +  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
 +  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const vector_range< viennacl::vector<SCALARTYPE, ALIGNMENT> > & r) 
 +  {
 +    assert(this->size() == r.size() && "Vector size mismatch!");
 +    
 +    if (this->size() > 0)
 +    {
 +      cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                        r.get().handle().get(),      // src buffer
 +                                        this->handle().get(),        //dest buffer
 +                                        sizeof(SCALARTYPE) * r.start(),       // src offset
 +                                        0,                                    //dest offset
 +                                        sizeof(SCALARTYPE) * r.size(), //number of bytes to be copied
 +                                        0, NULL, NULL);
 +                                        
 +      VIENNACL_ERR_CHECK(err);
 +    }
 +    
 +    return *this;
 +  }
 +  
 +  
 +  
  
-   
-   template<typename VectorType>
-   std::ostream & operator<<(std::ostream & s, vector_range<VectorType> const & proxy)
-   {
-     typedef typename VectorType::value_type   ScalarType;
-     std::vector<ScalarType> temp(proxy.size());
-     viennacl::copy(proxy, temp);
-     
-     //instead of printing 'temp' directly, let's reuse the existing functionality for viennacl::vector. It certainly adds overhead, but printing a vector is typically not about performance...
-     VectorType temp2(temp.size());
-     viennacl::copy(temp, temp2);
-     s << temp2;
-     return s;
-   }
-   
-   
-   
-   
+ 
+ 
    /////////////////////////////////////////////////////////////
    ///////////////////////// CPU to GPU ////////////////////////
    /////////////////////////////////////////////////////////////
++<<<<<<< HEAD
 +  
++=======
+ 
++>>>>>>> upstream/1.5.1
    template <typename VectorType, typename SCALARTYPE>
    void copy(const VectorType & cpu_vector,
              vector_range<vector<SCALARTYPE> > & gpu_vector_range )
@@@ -331,8 -97,8 +338,11 @@@
    /////////////////////////////////////////////////////////////
    ///////////////////////// GPU to CPU ////////////////////////
    /////////////////////////////////////////////////////////////
-   
  
++<<<<<<< HEAD
++=======
+ 
++>>>>>>> upstream/1.5.1
    template <typename SCALARTYPE, typename VectorType>
    void copy(vector_range<vector<SCALARTYPE> > const & gpu_vector_range,
              VectorType & cpu_vector)
@@@ -378,6 -139,12 +383,15 @@@
      return vector_range<VectorType>(vec, r1);
    }
  
++<<<<<<< HEAD
++=======
+   template <typename VectorType>
+   vector_range<VectorType> project(viennacl::vector_range<VectorType> & vec, viennacl::range const & r1)
+   {
+     assert(r1.size() <= vec.size() && bool("Size of range invalid!"));
+     return vector_range<VectorType>(vec, viennacl::range(vec.start() + r1.start(), vec.start() + r1.start() + r1.size()));
+   }
++>>>>>>> upstream/1.5.1
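
The overload added above makes project() composable: a range taken from a
vector_range is re-anchored against the underlying vector. A sketch, assuming
that overload:

    #include "viennacl/vector.hpp"
    #include "viennacl/vector_proxy.hpp"

    int main()
    {
      viennacl::vector<float> v(20);
      viennacl::vector_range<viennacl::vector<float> > outer(v, viennacl::range(2, 12));
      // start = 2 + 1 = 3, stop = 3 + 4 = 7, so this equals project(v, viennacl::range(3, 7))
      viennacl::project(outer, viennacl::range(1, 5)) *= 2.0f;
      return 0;
    }
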
  
  //
  //
@@@ -389,12 -156,16 +403,25 @@@
  
  
  
++<<<<<<< HEAD
 +
 +  template <typename VectorType>
 +  class vector_slice
 +  {
 +      typedef vector_slice<VectorType>             self_type;
 +    
++=======
+   /** @brief Class for representing strided subvectors of a bigger vector x.
+     *
+     * In MATLAB notation, this could, for example, refer to the subvector x(3:2:8) of a vector x.
+     */
+   template <typename VectorType>
+   class vector_slice : public vector_base<typename VectorType::cpu_value_type>
+   {
+       typedef vector_slice<VectorType>             self_type;
+       typedef vector_base<typename VectorType::cpu_value_type> base_type;
+ 
++>>>>>>> upstream/1.5.1
      public:
        typedef typename VectorType::value_type      value_type;
        typedef slice::size_type                     size_type;
@@@ -403,261 -174,38 +430,295 @@@
        typedef const value_type &                   const_reference;
        typedef typename VectorType::const_iterator  const_iterator;
        typedef typename VectorType::iterator        iterator;
++<<<<<<< HEAD
 +      
 +
 +      typedef typename viennacl::result_of::cpu_value_type<value_type>::type    cpu_value_type;
 +      
 +      static const int alignment = VectorType::alignment;
 +      
 +      vector_slice(VectorType & v, 
 +                   slice const & entry_slice) : v_(v), entry_slice_(entry_slice) {}
 +                   
 +      size_type start() const { return entry_slice_.start(); }
 +      size_type stride() const { return entry_slice_.stride(); }
 +      size_type size() const { return entry_slice_.size(); }
 +
 +      
 +      /** @brief Operator overload for v1 = A * v2, where v1 and v2 are vector slices and A is a dense matrix.
 +      *
 +      * @param proxy An expression template proxy class
 +      */
 +      template <typename MatrixType>
 +      typename viennacl::enable_if< viennacl::is_matrix<MatrixType>::value, self_type &>::type
 +      operator=(const vector_expression< const MatrixType,
 +                                         const self_type,
 +                                         op_prod> & proxy);
 +      
 +      
 +      
 +
 +      template <typename LHS, typename RHS, typename OP>
 +      self_type & operator=(const vector_expression< LHS,
 +                                                     RHS,
 +                                                     OP > & proxy) 
 +      {
 +        VectorType temp = proxy;
 +        *this = temp;
 +        return *this;
 +      }      
 +
 +
 +      /** @brief Convenience function for assigning a vector directly to a vector slice of suitable size */
 +      self_type & operator=(const VectorType & v) 
 +      {
 +        assert(size() == v.size() && "Vector slice and vector size mismatch!");
 +        
 +        if (size() > 0)
 +          viennacl::linalg::assign(*this, v);
 +        
 +        return *this;
 +      }      
 +
 +      /** @brief Convenience function for assigning a vector slice directly to another vector slice of suitable size */
 +      self_type & operator=(const self_type & v) 
 +      {
 +        assert(size() == v.size() && "Sizes of vector slices don't match!");
 +        
 +        if (size() > 0)
 +          viennacl::linalg::assign(*this, v);
 +        
 +        return *this;
 +      }      
 +
 +      ///////////// operator +=
 +
 +      self_type & operator += (VectorType const & other)
 +      {
 +        viennacl::linalg::inplace_add(*this, other);
 +        return *this;
 +      }
 +
 +      self_type & operator += (self_type const & other)
 +      {
 +        viennacl::linalg::inplace_add(*this, other);
 +        return *this;
 +      }
 +      
 +      ///////////// operator -=
 +
 +      self_type & operator -= (VectorType const & other)
 +      {
 +        viennacl::linalg::inplace_sub(*this, other);
 +        return *this;
 +      }
 +
 +      self_type & operator -= (self_type const & other)
 +      {
 +        viennacl::linalg::inplace_sub(*this, other);
 +        return *this;
 +      }
 +
 +      ///////////// operator *=
 +      self_type & operator *= (cpu_value_type const & cpu_val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, cpu_val);
 +        return *this;
 +      }
 +      
 +      self_type & operator *= (value_type const & gpu_val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, gpu_val);
 +        return *this;
 +      }
 +
 +      ///////////// operator /=
 +      self_type & operator /= (cpu_value_type const & cpu_val)
 +      {
 +        viennacl::linalg::inplace_mult(*this, cpu_value_type(1) / cpu_val);
 +        return *this;
 +      }
 +      
 +      self_type & operator /= (value_type const & gpu_val)
 +      {
 +        viennacl::linalg::inplace_divide(*this, gpu_val);
 +        return *this;
 +      }
 +      
 +      
 +      ///////////// Direct manipulation via operator() and operator[]
 +      //read-write access to an element of the vector
 +      /** @brief Read-write access to a single element of the vector
 +      */
 +      entry_proxy<cpu_value_type> operator()(size_type index)
 +      {
 +        return entry_proxy<cpu_value_type>(index * stride() + start(), v_.get());
 +      }
 +
 +      /** @brief Read-write access to a single element of the vector
 +      */
 +      entry_proxy<cpu_value_type> operator[](size_type index)
 +      {
 +        return entry_proxy<cpu_value_type>(index * stride() + start(), v_.get());
 +      }
 +
 +
 +      /** @brief Read access to a single element of the vector
 +      */
 +      scalar<cpu_value_type> operator()(size_type index) const
 +      {
 +        scalar<cpu_value_type> tmp;
 +        cl_int err;
 +        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), v_.get(), tmp.handle(), sizeof(cpu_value_type)*(index * stride() + start()), 0, sizeof(cpu_value_type), 0, NULL, NULL);
 +        VIENNACL_ERR_CHECK(err);
 +        return tmp;
 +      }
 +      
 +      /** @brief Read access to a single element of the vector
 +      */
 +      scalar<cpu_value_type> operator[](size_type index) const
 +      {
 +        return operator()(index);
 +      }
 +      
 +      ///////////// iterators:
 +      /** @brief Returns an iterator pointing to the beginning of the vector (STL-like) */
 +      iterator begin()
 +      {
 +        return iterator(v_, 0, start(), stride());
 +      }
 +
 +      /** @brief Returns an iterator pointing to the end of the vector (STL-like) */
 +      iterator end()
 +      {
 +        return iterator(v_, size(), start(), stride());
 +      }
 +
 +      /** @brief Returns a const-iterator pointing to the beginning of the vector (STL-like) */
 +      const_iterator begin() const
 +      {
 +        return const_iterator(v_, 0, start(), stride());
 +      }
 +
 +      /** @brief Returns a const-iterator pointing to the end of the vector (STL-like) */
 +      const_iterator end() const
 +      {
 +        return const_iterator(v_, size(), start(), stride());
 +      }
 +      
 +      ///////////// Misc
 +
 +      VectorType & get() { return v_; }
 +      const VectorType & get() const { return v_; }
 +
 +    private:
 +      VectorType & v_;
 +      slice entry_slice_;
 +  };
 +  
 +  
 +  //implement operator= for vector:
 +  
 +  template <typename SCALARTYPE, unsigned int ALIGNMENT>
 +  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
 +  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const vector_slice< viennacl::vector<SCALARTYPE, ALIGNMENT> > & r) 
 +  {
 +    assert(this->size() == r.size() && "Vector size mismatch!");
 +    
 +    if (this->size() > 0)
 +      viennacl::linalg::assign(*this, r);
 +    
 +    return *this;
 +  }
 +  
 +  
 +  
 +
 +  
 +  template<typename VectorType>
 +  std::ostream & operator<<(std::ostream & s, vector_slice<VectorType> const & proxy)
 +  {
 +    typedef typename VectorType::value_type   ScalarType;
 +    std::vector<ScalarType> temp(proxy.size());
 +    viennacl::copy(proxy, temp);
 +    
 +    //instead of printing 'temp' directly, let's reuse the existing functionality for viennacl::vector. It certainly adds overhead, but printing a vector is typically not about performance...
 +    VectorType temp2(temp.size());
 +    viennacl::copy(temp, temp2);
 +    s << temp2;
 +    return s;
 +  }
 +  
 +  
 +  
 +  
 +  /////////////////////////////////////////////////////////////
 +  ///////////////////////// CPU to GPU ////////////////////////
 +  /////////////////////////////////////////////////////////////
 +  
++=======
+ 
+       typedef typename VectorType::cpu_value_type  cpu_value_type;
+ 
+       static const int alignment = VectorType::alignment;
+ 
+       vector_slice(VectorType & v, slice const & entry_slice)
+           : base_type(v.handle(), entry_slice.size(), v.start() + v.stride() * entry_slice.start(), v.stride() * entry_slice.stride()) {}
+ 
+ 
+       using base_type::operator=;
+ 
+   };
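
As with vector_range, both sides admit the same surface syntax; viennacl::slice
takes (start, stride, size), matching the slice(...) calls in the project()
overloads further below. A minimal sketch:

    #include "viennacl/vector.hpp"
    #include "viennacl/vector_proxy.hpp"

    int main()
    {
      viennacl::vector<float> v(10);
      viennacl::slice s(3, 2, 3);                                // entries 3, 5, 7
      viennacl::vector_slice<viennacl::vector<float> > vs(v, s);
      vs *= 0.5f;
      return 0;
    }
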
+ 
+ 
+   /////////////////////////////////////////////////////////////
+   ///////////////////////// CPU to GPU ////////////////////////
+   /////////////////////////////////////////////////////////////
+ 
++>>>>>>> upstream/1.5.1
    template <typename VectorType, typename SCALARTYPE>
    void copy(const VectorType & cpu_vector,
              vector_slice<vector<SCALARTYPE> > & gpu_vector_slice )
    {
++<<<<<<< HEAD
 +    assert(cpu_vector.end() - cpu_vector.begin() >= 0);
 +    
 +    if (cpu_vector.end() - cpu_vector.begin() > 0)
 +    {
 +      
 +      // OpenCL 1.0 version: (no use of clEnqueueWriteBufferRect())
 +      std::vector<SCALARTYPE> temp_buffer(gpu_vector_slice.stride() * gpu_vector_slice.size());
 +      
 +      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                        gpu_vector_slice.get().handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_slice.start(), 
 +                                        sizeof(SCALARTYPE)*temp_buffer.size(),
 +                                        &(temp_buffer[0]), 0, NULL, NULL);
 +      
 +      VIENNACL_ERR_CHECK(err);
 +
 +      for (std::size_t i=0; i<cpu_vector.size(); ++i)
 +      {
 +        temp_buffer[i * gpu_vector_slice.stride()] = cpu_vector[i];
 +      }
 +      
 +      err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                 gpu_vector_slice.get().handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_slice.start(),
 +                                 sizeof(SCALARTYPE)*temp_buffer.size(),
 +                                 &(temp_buffer[0]), 0, NULL, NULL);
 +      
 +      VIENNACL_ERR_CHECK(err);
++=======
+     if (cpu_vector.size() > 0)
+     {
+       std::vector<SCALARTYPE> temp_buffer(gpu_vector_slice.stride() * gpu_vector_slice.size());
+ 
+       viennacl::backend::memory_read(gpu_vector_slice.handle(), sizeof(SCALARTYPE)*gpu_vector_slice.start(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+ 
+       for (vcl_size_t i=0; i<cpu_vector.size(); ++i)
+         temp_buffer[i * gpu_vector_slice.stride()] = cpu_vector[i];
+ 
+       viennacl::backend::memory_write(gpu_vector_slice.handle(), sizeof(SCALARTYPE)*gpu_vector_slice.start(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
++>>>>>>> upstream/1.5.1
      }
    }
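
Both branches implement the same read-modify-write scatter: fetch the span
covered by the slice, overwrite every stride-spaced entry with host data, and
write the span back. A usage sketch; the GPU-to-CPU overload below is the
mirror-image gather:

    #include <vector>
    #include "viennacl/vector.hpp"
    #include "viennacl/vector_proxy.hpp"

    int main()
    {
      viennacl::vector<float> v(10);
      viennacl::vector_slice<viennacl::vector<float> > vs(v, viennacl::slice(1, 2, 3));
      std::vector<float> host(3, 42.0f);
      viennacl::copy(host, vs);   // writes v[1], v[3], v[5]
      return 0;
    }
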
  
@@@ -666,30 -214,21 +727,46 @@@
    /////////////////////////////////////////////////////////////
    ///////////////////////// GPU to CPU ////////////////////////
    /////////////////////////////////////////////////////////////
++<<<<<<< HEAD
 +  
++=======
+ 
++>>>>>>> upstream/1.5.1
  
    template <typename VectorType, typename SCALARTYPE>
    void copy(vector_slice<vector<SCALARTYPE> > const & gpu_vector_slice,
              VectorType & cpu_vector)
    {
++<<<<<<< HEAD
 +    assert(cpu_vector.end() - cpu_vector.begin() >= 0);
 +    
 +    if (cpu_vector.end() > cpu_vector.begin())
 +    {
 +      // OpenCL 1.0 version: (no use of clEnqueueWriteBufferRect())
 +      std::vector<SCALARTYPE> temp_buffer(gpu_vector_slice.stride() * gpu_vector_slice.size());
 +      
 +      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
 +                                        gpu_vector_slice.get().handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_slice.start(), 
 +                                        sizeof(SCALARTYPE)*temp_buffer.size(),
 +                                        &(temp_buffer[0]), 0, NULL, NULL);
 +      
 +      VIENNACL_ERR_CHECK(err);
 +
 +      for (std::size_t i=0; i<cpu_vector.size(); ++i)
 +      {
 +        cpu_vector[i] = temp_buffer[i * gpu_vector_slice.stride()];
 +      }
++=======
+     assert(gpu_vector_slice.end() - gpu_vector_slice.begin() >= 0 && bool("Range must have nonnegative length!"));
+ 
+     if (gpu_vector_slice.end() - gpu_vector_slice.begin() > 0)
+     {
+       std::vector<SCALARTYPE> temp_buffer(gpu_vector_slice.stride() * gpu_vector_slice.size());
+       viennacl::backend::memory_read(gpu_vector_slice.handle(), sizeof(SCALARTYPE)*gpu_vector_slice.start(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+ 
+       for (vcl_size_t i=0; i<cpu_vector.size(); ++i)
+         cpu_vector[i] = temp_buffer[i * gpu_vector_slice.stride()];
++>>>>>>> upstream/1.5.1
      }
    }
  
@@@ -698,14 -237,39 +775,49 @@@
  
  
    //
++<<<<<<< HEAD
 +  // Convenience function
++=======
+   // Convenience functions
++>>>>>>> upstream/1.5.1
    //
    template <typename VectorType>
    vector_slice<VectorType> project(VectorType & vec, viennacl::slice const & s1)
    {
++<<<<<<< HEAD
++    return vector_slice<VectorType>(vec, s1);
++  }
++
++=======
+     assert(s1.size() <= vec.size() && bool("Size of slice larger than vector size!"));
      return vector_slice<VectorType>(vec, s1);
    }
  
+   template <typename VectorType>
+   vector_slice<VectorType> project(viennacl::vector_slice<VectorType> & vec, viennacl::slice const & s1)
+   {
+     assert(s1.size() <= vec.size() && bool("Size of slice larger than vector proxy!"));
+     return vector_slice<VectorType>(vec, viennacl::slice(vec.start() + s1.start(), vec.stride() * s1.stride(), s1.size()));
+   }
+ 
+   // interaction with range and vector_range:
+ 
+   template <typename VectorType>
+   vector_slice<VectorType> project(viennacl::vector_slice<VectorType> & vec, viennacl::range const & r1)
+   {
+     assert(r1.size() <= vec.size() && bool("Size of slice larger than vector proxy!"));
+     return vector_slice<VectorType>(vec, viennacl::slice(vec.start() + r1.start(), vec.stride(), r1.size()));
+   }
+ 
+   template <typename VectorType>
+   vector_slice<VectorType> project(viennacl::vector_range<VectorType> & vec, viennacl::slice const & s1)
+   {
+     assert(s1.size() <= vec.size() && bool("Size of slice larger than vector proxy!"));
+     return vector_slice<VectorType>(vec, viennacl::slice(vec.start() + s1.start(), s1.stride(), s1.size()));
+   }
+ 
+ 
++>>>>>>> upstream/1.5.1
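
The strided counterpart of the range-based helper works the same way; a sketch
using the first project(vector, slice) overload above:

    #include "viennacl/vector.hpp"
    #include "viennacl/vector_proxy.hpp"

    int main()
    {
      viennacl::vector<float> v(16);
      // view of the even-indexed entries 0, 2, ..., 14; doubles them in place
      viennacl::project(v, viennacl::slice(0, 2, 8)) *= 2.0f;
      return 0;
    }
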
  }
  
- #endif
+ #endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/viennacl.git


