[viennacl] 05/09: Reset sources

Toby St Clere Smithe tsmithe-guest at moszumanska.debian.org
Wed Feb 19 19:09:55 UTC 2014


This is an automated email from the git hooks/post-receive script.

tsmithe-guest pushed a commit to branch master
in repository viennacl.

commit c77aa54687b2a56406a257c8cf975eddd08d78ce
Author: Toby Smithe <git at tsmithe.net>
Date:   Wed Feb 19 16:16:35 2014 +0000

    Reset sources
---
 CMakeLists.txt                           |    5 -
 README                                   |    6 -
 changelog                                |    5 -
 doc/Doxyfile.in                          |    4 -
 doc/manual/algorithms.tex                |  135 ---
 doc/manual/changelogs.tex                |   21 -
 doc/manual/contributors.tex              |    6 -
 doc/manual/cover.tex                     |    4 -
 doc/manual/kernel-generation.tex         |   32 -
 doc/manual/multi-device.tex              |   17 -
 doc/manual/types.tex                     |   15 -
 doc/manual/viennacl.bib                  |    4 -
 doc/manual/viennacl.tex                  |   12 -
 examples/benchmarks/CMakeLists.txt       |   11 -
 examples/benchmarks/blas3.cpp            |  203 ----
 examples/benchmarks/solver.cpp           |  454 --------
 examples/benchmarks/sparse.cpp           |  306 ------
 examples/tutorial/CMakeLists.txt         |    7 -
 examples/tutorial/iterative-ublas.cpp    |  169 ---
 examples/tutorial/iterative.cpp          |  247 -----
 examples/tutorial/lanczos.cpp            |   47 -
 examples/tutorial/power-iter.cpp         |   44 -
 examples/tutorial/qr.cpp                 |    8 -
 tests/CMakeLists.txt                     |   11 -
 tests/src/blas3_solve_double.cpp         |  115 --
 tests/src/external_1.cpp                 |   10 -
 tests/src/external_2.cpp                 |   13 -
 tests/src/nmf.cpp                        |   72 --
 tests/src/sparse.cpp                     |   97 --
 tests/src/svd.cpp                        |  199 ----
 viennacl/ell_matrix.hpp                  |  234 ----
 viennacl/forwards.h                      |   84 --
 viennacl/generator/forwards.h            |   56 -
 viennacl/hyb_matrix.hpp                  |  265 -----
 viennacl/linalg/bicgstab.hpp             |   19 -
 viennacl/linalg/bisect.hpp               |  107 --
 viennacl/linalg/cg.hpp                   |   10 -
 viennacl/linalg/detail/ilu/block_ilu.hpp |  274 -----
 viennacl/linalg/detail/ilu/common.hpp    |  121 ---
 viennacl/linalg/detail/ilu/ilu0.hpp      |  222 ----
 viennacl/linalg/detail/ilu/ilut.hpp      |  247 -----
 viennacl/linalg/eig.hpp                  |   18 -
 viennacl/linalg/gmres.hpp                |   13 -
 viennacl/linalg/ilu.hpp                  |    4 -
 viennacl/linalg/inner_prod.hpp           |   28 -
 viennacl/linalg/jacobi_precond.hpp       |   19 -
 viennacl/linalg/lanczos.hpp              |  295 -----
 viennacl/linalg/matrix_operations.hpp    |  557 +---------
 viennacl/linalg/nmf.hpp                  |  117 --
 viennacl/linalg/norm_1.hpp               |   24 -
 viennacl/linalg/norm_2.hpp               |   25 -
 viennacl/linalg/norm_inf.hpp             |   24 -
 viennacl/linalg/power_iter.hpp           |   76 --
 viennacl/linalg/prod.hpp                 |   81 --
 viennacl/linalg/qr.hpp                   |  316 +-----
 viennacl/linalg/row_scaling.hpp          |   20 -
 viennacl/linalg/svd.hpp                  |  414 -------
 viennacl/linalg/vector_operations.hpp    |  730 -------------
 viennacl/matrix.hpp                      | 1081 -------------------
 viennacl/matrix_proxy.hpp                |  649 -----------
 viennacl/meta/predicate.hpp              |  128 ---
 viennacl/meta/result_of.hpp              |  288 -----
 viennacl/meta/tag_of.hpp                 |    4 -
 viennacl/ocl/backend.hpp                 |   23 -
 viennacl/ocl/context.hpp                 |   41 -
 viennacl/ocl/enqueue.hpp                 |  160 ---
 viennacl/ocl/kernel.hpp                  |  754 -------------
 viennacl/ocl/platform.hpp                |   10 -
 viennacl/slice.hpp                       |   43 -
 viennacl/toeplitz_matrix.hpp             |    4 -
 viennacl/tools/adapter.hpp               |   14 -
 viennacl/tools/matrix_size_deducer.hpp   |   44 -
 viennacl/tools/tools.hpp                 |  412 -------
 viennacl/traits/handle.hpp               |  107 --
 viennacl/traits/size.hpp                 |  244 -----
 viennacl/traits/start.hpp                |  101 --
 viennacl/traits/stride.hpp               |   81 --
 viennacl/vector.hpp                      | 1726 ------------------------------
 viennacl/vector_proxy.hpp                |  548 ----------
 79 files changed, 2 insertions(+), 13139 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index af714cd..bd38b04 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,13 +30,8 @@ ENDIF(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 ################
 
 set(VERSION_MAJOR 1)
-<<<<<<< HEAD
-set(VERSION_MINOR 3)
-set(VERSION_PATCH 0)
-=======
 set(VERSION_MINOR 5)
 set(VERSION_PATCH 1)
->>>>>>> upstream/1.5.1
 set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH})
 
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
diff --git a/README b/README
index 1897af3..b4ea993 100644
--- a/README
+++ b/README
@@ -26,18 +26,12 @@ ViennaCL requires the following:
 The first step is to extract the file:
 
 Unix-based OS:
-<<<<<<< HEAD
-$> gunzip ViennaCL-1.3.0.tar.gz
-$> tar -xf ViennaCL-1.3.0.tar
-$> cd ViennaCL-1.3.0
-=======
 $> gunzip ViennaCL-1.5.1.tar.gz
 $> tar -xf ViennaCL-1.5.1.tar
 $> cd ViennaCL-1.5.1
 
 Windows:
 Extract the file using your favorite compressor/decompressor, e.g. 7-zip.
->>>>>>> upstream/1.5.1
 
 ViennaCL is a header-only library, therefore it is sufficient to copy the subfolder viennacl/ (holding the header files) into your project directory or your system include directory. For instructions on how to set the include paths correctly, please refer to the documentation of your compiler.
 
diff --git a/changelog b/changelog
index bf2bae1..c3fdc04 100644
--- a/changelog
+++ b/changelog
@@ -2,10 +2,6 @@
 **** ViennaCL Change Logs ****
 ******************************
 
-<<<<<<< HEAD
-*** Version 1.3.x ***
-
-=======
 *** Version 1.5.x ***
 
 -- Version 1.5.1 --
@@ -141,7 +137,6 @@ The following bugfixes and enhancements have been applied:
 - Fixed a problem with matrix-matrix products if the result matrix is not initialized properly (thanks to Laszlo Marak for finding the issue and a fix).
 - The operations C += prod(A, B) and C −= prod(A, B) for matrices A, B, and C no longer introduce temporaries if the three matrices are distinct.
 
->>>>>>> upstream/1.5.1
 -- Version 1.3.0 --
 Several new features enter this new minor version release.
 Some of the experimental features introduced in 1.2.0 keep their experimental state in 1.3.x due to the short time since 1.2.0, with exceptions listed below along with the new features:
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index e68a17f..2edec42 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -31,11 +31,7 @@ PROJECT_NAME           = "ViennaCL - The Vienna Computing Library"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
 
-<<<<<<< HEAD
-PROJECT_NUMBER         = 1.3.0
-=======
 PROJECT_NUMBER         = 1.5.1
->>>>>>> upstream/1.5.1
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
diff --git a/doc/manual/algorithms.tex b/doc/manual/algorithms.tex
index 1d2100b..4dba1a5 100644
--- a/doc/manual/algorithms.tex
+++ b/doc/manual/algorithms.tex
@@ -219,42 +219,14 @@ The triangular substitutions may be applied in parallel on GPUs by enabling \emp
 
 One parameter can be passed to the constructor of \lstinline|ilu0_tag|, being the boolean specifying whether level scheduling should be used.
 
-<<<<<<< HEAD
-\subsection{Incomplete LU Factorization with Static Pattern (ILU0)}
-Similar to ILUT, ILU0 computes an approximate LU factorization with sparse factors L and U.
-While ILUT determines the location of nonzero entries on the fly, ILU0 uses the sparsity pattern of A for the sparsity pattern of L and U \cite{saad-iterative-solution}.
-Due to the serial nature of the preconditioner, the setup as well as each application of ILU0 to the residual is computed on
-the CPU.
-
-\begin{lstlisting}
-//compute ILU0 preconditioner:
-ilu0_precond< SparseMatrix > vcl_ilu0(vcl_matrix,
-                                      viennacl::linalg::ilu0_tag());
-
-//solve (e.g. using conjugate gradient solver)
-vcl_result = viennacl::linalg::solve(vcl_matrix,
-                                     vcl_rhs,
-                                     viennacl::linalg::bicgstab_tag(),
-                                     vcl_ilut);   //preconditioner here
-\end{lstlisting}
-Two parameters can be passed to the constructor of \lstinline|ilu0_tag|:
-The first parameter specifies the lower row and column index for which ILU0 should be computed, while the second parameter specifies the upper bound on the row and column indices considered.
-For example, the parameter set $(2,5)$ is supplied, ILU0 is computed for the diagonal block $A(2:4, 2:4)$, where $2:4 = \{2, 3, 4 \}$.
-By default, ILU0 is computed for the full system matrix.
-=======
 \TIP{The performance of level scheduling depends strongly on the matrix pattern and is thus disabled by default.}
->>>>>>> upstream/1.5.1
 
 \subsection{Block-ILU}
 To overcome the serial nature of ILUT and ILU0 applied to the full system matrix,
 a parallel variant is to apply ILU to diagonal blocks of the system matrix.
 This is accomplished by the \lstinline|block_ilu| preconditioner, which takes
 the system matrix type as first template argument and the respective ILU-tag type as second template argument
-<<<<<<< HEAD
-(either \lstinline|ilut_tag| or \lstinline|ilu0_tag|). 
-=======
 (either \lstinline|ilut_tag| or \lstinline|ilu0_tag|). Support for accelerators using {\CUDA} or {\OpenCL} is provided.
->>>>>>> upstream/1.5.1
 
 \begin{lstlisting}
 //compute block-ILU preconditioner using ILU0 for each block:
@@ -268,16 +240,10 @@ vcl_result = viennacl::linalg::solve(vcl_matrix,
                                      viennacl::linalg::bicgstab_tag(),
                                      vcl_block_ilu0);
 \end{lstlisting}
-<<<<<<< HEAD
-A third argument can be passed to the constructor of \lstinline|block_ilu_precond|: 
-Either the number of blocks to be used (defaults to 4), or an index vector with fine-grained control over the blocks. Refer to the Doxygen pages in doc/doxygen for details.
-
-=======
 A third argument can be passed to the constructor of \lstinline|block_ilu_precond|:
 Either the number of blocks to be used (defaults to $8$), or an index vector with fine-grained control over the blocks. Refer to the Doxygen pages in doc/doxygen for details.
 
 \TIP{The number of blocks is a design parameter for your sparse linear system at hand. A higher number of blocks leads to better memory bandwidth utilization on GPUs, but may increase the number of solver iterations.}
->>>>>>> upstream/1.5.1
 
 \subsection{Jacobi Preconditioner}
 A Jacobi preconditioner is a simple diagonal preconditioner given by the reciprocals of the diagonal entries of the system matrix $A$.
@@ -365,64 +331,10 @@ viennacl::linalg::lanczos_tag ltag(0.85, 15, 0, 200);
 
 \TIP{Example code can be found in \lstinline|examples/tutorial/lanczos.cpp|.}
 
-\section{Eigenvalue Computations}
-%{\ViennaCL} 
-Two algorithms for the computations of the eigenvalues of a matrix $A$ are implemented in {\ViennaCL}:
-\begin{itemize}
-\item The Power Iteration \cite{golub:matrix-computations}
-\item The Lanczos Algorithm \cite{simon:lanczos-pro}
-\end{itemize}
-Depending on the parameter \lstinline|tag| either one of them is called. 
-Both algorithms can be used for either {\ublas} or {\ViennaCL} compressed matrices.\\
-In order to get the eigenvalue with the greatest absolut value the power iteration should be called. \\
-The lanczos algorithm returns a vector of the largest eigenvalues with the same type as the entries of the matrix.
-
-The algorithms are called for a matrix object \lstinline|A| by
-\begin{lstlisting}
-std::vector<double> largest_eigenvalues = viennacl::linalg::eig(A, ltag);
-double largest_eigenvalue = viennacl::linalg::eig(A, ptag);
-\end{lstlisting}
-
-
-\subsection{Power Iteration}
-The Power iteration aims at computing the eigenvalues of a matrix by calculating the product of the matrix and a vector for several times, where the resulting vector is used for the next product of the matrix and so on. The computation stops as soon as the norm of the vector converges. \\
-The final vector is the eigenvector to the eigenvalue with the greatest absolut value.\\
-To call this algorithm, \lstinline|piter_tag| must be used.
-This tag has only one parameter: \\ \lstinline|terminationfactor| defines the accuracy of the computation, i.e. if the new norm of the eigenvector changes less than this parameter the computation stops and returns the corresponding eigenvalue (default: $1e-10$).\\
-The call of the constructor may look like the following:
-\begin{lstlisting} 
-viennacl::linalg::piter_tag ptag(1e-8);
-\end{lstlisting}
-
-\TIP{Example code can be found in \lstinline|examples/tutorial/power-iter.cpp|.}
-
-\subsection{The Lanczos Algorithm}
-In order to compute the eigenvalues of a sparse high-dimensional matrix the lanczos algorithm can be used to find these. 
-This algorithm reformulates the given high-dimensional matrix in a way such that the matrix can be rewritten in a tridiagonal matrix at much lower dimension. The eigenvalues of this tridiagonal matrix are equal to the largest eigenvalues of the original matrix. \\
-The eigenvalues of the tridiagonal matrix are calculated by using the bisection method \cite{golub:matrix-computations}. \\
-To call this lanczos algorithm, \lstinline|lanczos_tag| must be used.
-This tag has several parameters that can be passed to the constructor:
-
-\begin{itemize}
- \item The exponent of epsilon for the tolerance of the reorthogonalization, defined by the parameter \lstinline|factor| (default: $0.75$)
- \item The method of the lanczos algorithm: $0$ uses partial reorthogonalization, $1$ full reothogonalization and $2$ does not use reorthogonalization (default: $0$)
- \item The number of eigenvalues that are returned is specified by \lstinline|num_eigenvalues| (default: $10$)
- \item The size of the krylov space used for the computations can be set by the parameter \lstinline|krylov_size| (default: $100$). The maximum number of iterations can be equal or less this parameter
-\end{itemize}
-The call of the constructor may look like the following:
-\begin{lstlisting}
-viennacl::linalg::lanczos_tag ltag(0.85, 15, 0, 200);
-\end{lstlisting}
-
-\TIP{Example code can be found in \lstinline|examples/tutorial/lanczos.cpp|.}
-
 
 \section{QR Factorization}
-<<<<<<< HEAD
-=======
 
 \NOTE{The current QR factorization implementation depends on {\ublas}.}
->>>>>>> upstream/1.5.1
 
 A matrix $A \in \mathbb{R}^{n\times m}$ can be factored into $A = Q R$, where $Q \in \mathbb{R}^{n\times n}$ is an
 orthogonal matrix and $R \in \mathbb{R}^{n \times m}$ is upper triangular. This so-called QR-factorization is important for eigenvalue computations as well as
@@ -437,11 +349,7 @@ worker function \lstinline|inplace_qr|. The upper triangular matrix $R$ is direc
 \end{lstlisting}
 If $A$ is a dense matrix from \ublas, the calculation is carried out on the CPU using a single thread. If $A$ is a
 \lstinline|viennacl::matrix|, a hybrid implementation is used: The panel factorization is carried out using \ublas, while expensive BLAS level 3 operations
-<<<<<<< HEAD
-are computed on the OpenCL device using multiple threads. 
-=======
 are computed on the OpenCL device using multiple threads.
->>>>>>> upstream/1.5.1
 
 Typically, the orthogonal matrix $Q$ is kept in implicit form because of computational efficiency.
 However, if $Q$ and $R$ have to be computed explicitly, the function \lstinline|recoverQ| can be used:
@@ -449,48 +357,6 @@ However, if $Q$ and $R$ have to be computed explicitly, the function \lstinline|
   viennacl::linalg::recoverQ(A, betas, Q, R);
 \end{lstlisting}
 Here, \lstinline|A| is the inplace QR-factored matrix, \lstinline|betas| are the coefficients of the Householder reflectors as returned by
-<<<<<<< HEAD
-\lstinline|inplace_qr|, while \lstinline|Q| and \lstinline|R| are the destination matrices.
-
-
-\section{Singular Value Decomposition}
-\NOTE{Singular Value Decomposition is experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
-be included in future releases!}
-
-\NOTE{Singular Value Decomposition in {\ViennaCLversion} is provided for a row-major matrix $A$ with single precision floating point entries (\lstinline|float|) only.}
-
-Any matrix $A$ can be factored as
-\begin{align}
- A = U \Sigma V^{\mathrm{T}}
-\end{align}
-with orthogonal matrices $U$ and $V$ and a diagonal matrix $\Sigma$ consisting of non-negative diagonal entries only.
-
-\begin{lstlisting}
- viennacl::matrix<ScalarType> A(size1, size2),
- viennacl::matrix<ScalarType> U(size1, size1),
- viennacl::matrix<ScalarType> V(size2, size2),
-
- viennacl::linalg::svd(A, U, V);
-\end{lstlisting}
-The input matrix \lstinline|A| is overwritten with $\Sigma$.
-
-\section{Nonnegative Matrix Factorization}
-\NOTE{Nonnegative Matrix Factorization is experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
-be included in future releases!}
-
-In various fields such as text mining, a matrix $V$ needs to be factored into factors $W$ and $H$ such that the function
-\begin{align*}
- f(W, H) = \Vert V - WH \Vert_{\mathrm{F}}^2
-\end{align*}
-is minimized. The algorithm proposed by Lee and Seoung \cite{lee:nmf} is available in ViennaCL as
-\begin{lstlisting}
- viennacl::matrix<ScalarType> V(size1, size2),
- viennacl::matrix<ScalarType> W(size1, size1),
- viennacl::matrix<ScalarType> H(size2, size2),
-
- viennacl::linalg::nmf(V, W, H);
-\end{lstlisting}
-=======
 \lstinline|inplace_qr|, while \lstinline|Q| and \lstinline|R| are the destination matrices. However, the explicit formation of $Q$ is expensive and is usually avoided.
 For a number of applications of the QR factorization it is required to apply $Q^T$ to a vector $b$. This is accomplished by
 \begin{lstlisting}
@@ -499,4 +365,3 @@ For a number of applications of the QR factorization it is required to apply $Q^
 without setting up $Q$ (or $Q^T$) explicitly.
 
 \TIP{Have a look at \lstinline|examples/tutorial/least-squares.cpp| for a least-squares computation using QR factorizations.}
->>>>>>> upstream/1.5.1
diff --git a/doc/manual/changelogs.tex b/doc/manual/changelogs.tex
index 31ba2b8..1daaafe 100644
--- a/doc/manual/changelogs.tex
+++ b/doc/manual/changelogs.tex
@@ -175,27 +175,6 @@ Some of the experimental features introduced in 1.2.0 keep their experimental st
 
 
 
-\section*{Version 1.3.x}
-
-\subsection*{Version 1.3.0}
-Several new features enter this new minor version release.
-Some of the experimental features introduced in 1.2.0 keep their experimental state in 1.3.x due to the short time since 1.2.0, with exceptions listed below along with the new features:
-\begin{itemize}
- \item Full support for ranges and slices for dense matrices and vectors (no longer experimental)
- \item QR factorization now possible for arbitrary matrix sizes (no longer experimental)
- \item Further improved matrix-matrix multiplication performance for matrix dimensions which are a multiple of 64 (particularly improves performance for NVIDIA GPUs)
- \item Added Lanczos and power iteration method for eigenvalue computations of dense and sparse matrices (experimental, contributed by G\"unther Mader and Astrid Rupp)
- \item Added singular value decomposition in single precision (experimental, contributed by Volodymyr Kysenko)
- \item Two new ILU-preconditioners added: ILU0 (contributed by Evan Bollig) and a block-diagonal ILU preconditioner using either ILUT or ILU0 for each block. Both preconditioners are computed entirely on the CPU.
- \item Automated OpenCL kernel generator based on high-level operation specifications added (many thanks to Philippe Tillet who had a lot of \emph{fun fun fun} working on this)
- \item Two new sparse matrix types (by Volodymyr Kysenko): \lstinline|ell_matrix| for the ELL format and \lstinline|hyb_matrix| for a hybrid format (contributed by Volodymyr Kysenko).
- \item Added possibility to specify the OpenCL platform used by a context
- \item Build options for the OpenCL compiler can now be supplied to a context (thanks to Krzysztof Bzowski for the suggestion)
- \item Added nonnegative matrix factorization by Lee and Seoung (contributed by Volodymyr Kysenko).
-\end{itemize}
-
-
-
 \section*{Version 1.2.x}
 
 \subsection*{Version 1.2.1}
diff --git a/doc/manual/contributors.tex b/doc/manual/contributors.tex
index a1edba4..79f24f9 100644
--- a/doc/manual/contributors.tex
+++ b/doc/manual/contributors.tex
@@ -17,10 +17,7 @@ Karl Rupp\\
 \textit{Code Contributors:} \\
 
 Evan Bollig \\
-<<<<<<< HEAD
-=======
 Alex Christensen (BYU) \\
->>>>>>> upstream/1.5.1
 Philipp Grabenweger \\
 Volodymyr Kysenko \\
 Nikolay Lukash \\
@@ -28,10 +25,7 @@ G\"unther Mader \\
 Vittorio Patriarca \\
 Florian Rudolf \\
 Astrid Rupp \\
-<<<<<<< HEAD
-=======
 Toby St Clere Smithe \\
->>>>>>> upstream/1.5.1
 Philippe Tillet \\
 Markus Wagner \\
 Josef Weinbub \\
diff --git a/doc/manual/cover.tex b/doc/manual/cover.tex
index b902a56..b9e5fe5 100644
--- a/doc/manual/cover.tex
+++ b/doc/manual/cover.tex
@@ -2,11 +2,7 @@
 \begin{titlepage}
 
 \vspace*{3cm}
-<<<<<<< HEAD
-\Huge{ViennaCL 1.3.0} 
-=======
 \Huge{ViennaCL 1.5.1}
->>>>>>> upstream/1.5.1
 \rule[0.0cm]{9.5cm}{0.05cm}
 \begin{flushright}
 \Large{User Manual}
diff --git a/doc/manual/kernel-generation.tex b/doc/manual/kernel-generation.tex
index 4b8c4b8..f1d78a8 100644
--- a/doc/manual/kernel-generation.tex
+++ b/doc/manual/kernel-generation.tex
@@ -1,42 +1,15 @@
-<<<<<<< HEAD
-\chapter{Automated User-Kernel Generation} \label{chap:kernel-generation}
-
-While {\ViennaCL} provides a convenient means of including custom compute kernels, cf.~Chap.~\ref{chap:custom},
-=======
 \chapter{Automated OpenCL User-Kernel Generation} \label{chap:kernel-generation}
 
 While {\ViennaCL} provides a convenient means of including custom {\OpenCL} compute kernels, cf.~Chap.~\ref{chap:custom},
->>>>>>> upstream/1.5.1
 it can be rather tedious to come up with a good compute kernel, or to come up with many similar kernels differing in small details only.
 For the case of BLAS level 1 and level 2 operations, {\ViennaCL} now provides an automated kernel generator, which takes a high-level specification of the operations and creates one or more suitable OpenCL kernels.
 This allows for high-performance implementations of algorithms which may otherwise lead to spurious temporary objects.
 
-<<<<<<< HEAD
-As our second example, we consider the operation
-=======
 Consider the operation
->>>>>>> upstream/1.5.1
 \begin{align*}
 \mathbf{x} = \mathbf{A} \times \bigl[ (\mathbf{y} \cdot (\mathbf{y}+\mathbf{z}))\mathbf{y} + \mathbf{z} \bigr] \ ,
 \end{align*}
 where $\mathbf{x}$, $\mathbf{y}$ and $\mathbf{z}$ denote vectors, $\mathbf{A}$ is a dense matrix, and the dot denotes the vector dot product.
-<<<<<<< HEAD
-With the proposed generator it is sufficient to write the following C++ code:
-\begin{lstlisting}
-// Instantiation of the symbolic variables
-symbolic_vector<0, NumericT> sX;
-symbolic_matrix<1, NumericT> sA;
-symbolic_vector<2, NumericT> sY;
-symbolic_vector<3, NumericT> sZ;
-
-//Creation of the custom operation
-custom_operation my_op(
- sX = prod(sA, inner_prod(sY, sY+sZ) * sY + sZ)
-                      );
-\end{lstlisting}
-where \lstinline|NumericT| is either \lstinline|float| or \lstinline|double|.
-The custom operation object \lstinline|my_op| can then be enqueued like any other kernel:
-=======
 With the generator it is sufficient to write the following C++ code in order to obtain an OpenCL kernel:
 \begin{lstlisting}
 // Instantiation of the symbolic variables
@@ -54,18 +27,13 @@ The string provided as second parameter is required and can be used to identify,
 No two \lstinline|custom_operation|s are allowed to be identified using the same string.
 
 The custom operation object \lstinline|my_op| can be enqueued like any other kernel:
->>>>>>> upstream/1.5.1
 \begin{lstlisting}
 //Execution of the custom operation
 viennacl::ocl::enqueue(my_op(x,A,y,z));
 \end{lstlisting}
 Here, \lstinline|x|, \lstinline|y|, \lstinline|z| are of type \lstinline|viennacl::vector<NumericT>| and \lstinline|A| is of type \lstinline|viennacl::matrix<NumericT>|.
 
-<<<<<<< HEAD
-\TIP{Sample code can be found in \lstinline|tests/src/generator_*.cpp|}
-=======
 \TIP{Sample code can be found in \lstinline|tests/src/generator_*.cpp|}
 
 \NOTE{ The kernel generator is still experimental, yet already able to generate rather complex compute kernels. }
 
->>>>>>> upstream/1.5.1
diff --git a/doc/manual/multi-device.tex b/doc/manual/multi-device.tex
index e441ebf..f8ce6d3 100644
--- a/doc/manual/multi-device.tex
+++ b/doc/manual/multi-device.tex
@@ -21,11 +21,7 @@ This default context is identified by the ID $0$ (of type \lstinline|long|).
 If a different platform should be used on a machine with multiple platforms available,
 this can be achieved with
 \begin{lstlisting}
-<<<<<<< HEAD
- viennacl::ocl::setup_context_platform_index(id, platform_index);
-=======
  viennacl::ocl::set_context_platform_index(id, platform_index);
->>>>>>> upstream/1.5.1
 \end{lstlisting}
 where the context ID is \lstinline|id| and \lstinline|platform_index| refers to the array index of the platform as returned by \lstinline|clGetPlatformIDs()|.
 
@@ -34,13 +30,8 @@ By default, only the first device in the context is used for all operations. Thi
  viennacl::ocl::current_context().current_device();
  viennacl::ocl::current_device(); //equivalent to above
 \end{lstlisting}
-<<<<<<< HEAD
-A user may wish to use multiple contexts, where each context consists of a subset of the available devices. 
-To setup a context with ID \lstinline|id| with a particular device type only, the user has to specify this 
-=======
 A user may wish to use multiple {\OpenCL} contexts, where each context consists of a subset of the available devices.
 To setup a context with ID \lstinline|id| with a particular device type only, the user has to specify this
->>>>>>> upstream/1.5.1
 prior to any other {\ViennaCL} related statements:
 \begin{lstlisting}
 //use only GPUs:
@@ -100,19 +91,11 @@ If the supplied device is not part of the context, an error message is printed a
 
 
 \section{Setting OpenCL Compiler Flags}
-<<<<<<< HEAD
-Each context provides a member function \lstinline|.build_options()|, which can be used to pass OpenCL compiler flags prior to compilation.
-=======
 Each {\OpenCL} context provides a member function \lstinline|.build_options()|, which can be used to pass OpenCL compiler flags prior to compilation.
->>>>>>> upstream/1.5.1
 Note that flags need to be passed to the context prior to the compilation of the respective kernels, i.e.~prior to the first instantiation of the respective matrix or vector types.
 
 To pass the \lstinline|-cl-mad-enable| flag to the current context, the line
 \begin{lstlisting}
  viennacl::ocl::current_context().build_options("-cl-mad-enable");
 \end{lstlisting}
-<<<<<<< HEAD
 is sufficient. Confer to the {\OpenCL} standard for a full list of flags.
-=======
-is sufficient. Confer to the {\OpenCL} standard for a full list of flags.
->>>>>>> upstream/1.5.1
diff --git a/doc/manual/types.tex b/doc/manual/types.tex
index 8c1edfc..ecb31a9 100644
--- a/doc/manual/types.tex
+++ b/doc/manual/types.tex
@@ -358,11 +358,7 @@ The use of \texttt{coordinate\_matrix$<$T, alignment$>$} is similar to that of t
 The interface is described in Tab.~\ref{tab:coordinate-matrix-interface}.
 
 %\TIP{In {\ViennaCLversion} the use of \lstinline|compressed\_matrix| over \lstinline|coordinate\_matrix| is encouraged due to better performance!}
-<<<<<<< HEAD
-\NOTE{Note that preconditioners in Sec.~\ref{sec:preconditioner} do not work with \lstinline|coordinate_matrix| yet.}
-=======
 \NOTE{Note that only a few preconditioners work with \lstinline|coordinate_matrix| so far, cf.~ Sec.~\ref{sec:preconditioner}.}
->>>>>>> upstream/1.5.1
 
 
 \subsection{ELL Matrix}
@@ -380,12 +376,6 @@ For an example use of an \lstinline|ell_matrix|, have a look at \lstinline|examp
 \subsection{Hybrid Matrix}
 The higher performance of the ELL format for matrices with approximately the same number of entries per row
 and the higher flexibility of the CSR format are combined in the \lstinline|hyb_matrix| type, where the main part of the system matrix is stored in ELL format and excess entries are stored in CSR format.
-<<<<<<< HEAD
-
-For an example use of an \lstinline|hyb_matrix|, have a look at \lstinline|examples/benchmarks/sparse.cpp|.
-
-\NOTE{Note that preconditioners in Sec.~\ref{sec:preconditioner} do not work with \lstinline|hyb_matrix| yet.}
-=======
 
 For an example use of an \lstinline|hyb_matrix|, have a look at \lstinline|examples/benchmarks/sparse.cpp|.
 
@@ -398,7 +388,6 @@ An additional array is used to store the global row index $r$ in the sparse matr
 
 \NOTE{Note that preconditioners in Sec.~\ref{sec:preconditioner} do not work with \lstinline|compressed_compressed_matrix| yet.}
 
->>>>>>> upstream/1.5.1
 
 \section{Proxies}
 Similar to {\ublas}, {\ViennaCL} provides \lstinline|range| and \lstinline|slice| objects in order to conveniently manipulate dense submatrices and vectors. The functionality is
@@ -438,11 +427,7 @@ The proxy objects can now be manipulated in the same way as vectors and dense ma
 additions work as usual, e.g.
 \begin{lstlisting}
  vcl_sub += vcl_sub; //or project(v, r) += project(v, r);
-<<<<<<< HEAD
- M_sub += M_sub;     //or project(M, r, r) += project(M, r, r);
-=======
  M_sub   += M_sub;   //or project(M, r, r) += project(M, r, r);
->>>>>>> upstream/1.5.1
 \end{lstlisting}
  Submatrix-Submatrix products are computed in the same manner and are handy for many block-based linear algebra algorithms.
 
diff --git a/doc/manual/viennacl.bib b/doc/manual/viennacl.bib
index cf9fd32..bdda2c3 100644
--- a/doc/manual/viennacl.bib
+++ b/doc/manual/viennacl.bib
@@ -172,8 +172,4 @@
  booktitle = {Advances in Neural Information Processing Systems 13},
  pages = {556–562},
  year = {2000},
-<<<<<<< HEAD
-} 
-=======
 }
->>>>>>> upstream/1.5.1
diff --git a/doc/manual/viennacl.tex b/doc/manual/viennacl.tex
index 5a39d16..85c60f8 100644
--- a/doc/manual/viennacl.tex
+++ b/doc/manual/viennacl.tex
@@ -61,13 +61,8 @@
 \newcommand{\OpenCL} {\texttt{OpenCL}}
 \newcommand{\CUDA} {\texttt{CUDA}}
 \newcommand{\ViennaCL} {\texttt{ViennaCL}}
-<<<<<<< HEAD
-\newcommand{\ViennaCLversion} {\texttt{ViennaCL 1.3.0}}
-\newcommand{\ViennaCLminorversion} {\texttt{ViennaCL 1.3.x}}
-=======
 \newcommand{\ViennaCLversion} {\texttt{ViennaCL 1.5.1}}
 \newcommand{\ViennaCLminorversion} {\texttt{ViennaCL 1.5.x}}
->>>>>>> upstream/1.5.1
 \newcommand{\Boost} {\texttt{Boost}}
 \newcommand{\ublas} {\texttt{uBLAS}}
 \newcommand{\Eigen} {\texttt{Eigen}}
@@ -138,12 +133,6 @@ library users are advised to use them with extra care and be prepared for interf
 \include{multi-device}
 \include{custom-kernels}
 \include{custom-contexts}
-<<<<<<< HEAD
-\include{kernel-generation}
-\include{tuning}
-\include{other-libs}
-\include{benchmarks}
-=======
 %\include{kernel-generation}
 %\include{tuning}
 \include{structured-matrices}
@@ -152,7 +141,6 @@ library users are advised to use them with extra care and be prepared for interf
 %%%%%%%%%%%%%%% Addon Functionality %%%%%%%%%%%%%%%%
 
 \part{Miscellaneous}
->>>>>>> upstream/1.5.1
 \include{design}
 
 % Appendix
diff --git a/examples/benchmarks/CMakeLists.txt b/examples/benchmarks/CMakeLists.txt
index 242acf3..0e880c7 100644
--- a/examples/benchmarks/CMakeLists.txt
+++ b/examples/benchmarks/CMakeLists.txt
@@ -3,16 +3,6 @@ foreach(bench blas3 copy scheduler vector)
    add_executable(${bench}bench-cpu ${bench}.cpp)
 endforeach()
 
-<<<<<<< HEAD
-if(ENABLE_UBLAS)
-   include_directories(${Boost_INCLUDE_DIRS})
-   foreach(bench sparse solver)
-      add_executable(${bench}bench ${bench}.cpp)
-      target_link_libraries(${bench}bench ${OPENCL_LIBRARIES})
-   endforeach()
-endif()
-
-=======
 if (ENABLE_UBLAS)
     include_directories(${Boost_INCLUDE_DIRS})
     foreach(bench sparse solver)
@@ -62,7 +52,6 @@ if (ENABLE_CUDA)
 endif (ENABLE_CUDA)
 
 
->>>>>>> upstream/1.5.1
 # IF(CMAKE_COMPILER_IS_GNUCXX)
    #ADD_DEFINITIONS(-Wall -pedantic -O0 -g)
 #   ADD_DEFINITIONS(-Wall -pedantic -O3)
diff --git a/examples/benchmarks/blas3.cpp b/examples/benchmarks/blas3.cpp
index 95e0447..128c8b8 100644
--- a/examples/benchmarks/blas3.cpp
+++ b/examples/benchmarks/blas3.cpp
@@ -1,205 +1,3 @@
-<<<<<<< HEAD
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//disable debug mechanisms to have a fair benchmark environment
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/matrix_proxy.hpp"
-
-// Some helper functions for this tutorial:
-#include "../tutorial/Random.hpp"
-
-
-#include "benchmark-utils.hpp"
-
-/*
-*   Tutorial: BLAS level 3 functionality
-*   
-*/
-
-#define BLAS3_MATRIX_SIZE   1024
-
-template<typename ScalarType>
-int run_benchmark()
-{
-  Timer timer;
-  double exec_time;
-
-  //
-  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
-  //
-  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-
-  //
-  // Fill the matrix
-  //
-  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
-    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
-      stl_A[i*BLAS3_MATRIX_SIZE + j] = random<ScalarType>();
-
-  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
-    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
-      stl_B[i + j*BLAS3_MATRIX_SIZE] = random<ScalarType>();
-
-  //
-  // Set up some ViennaCL objects
-  //
-  viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());
-  //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
-  //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
-  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  
-  
-  /////////////////////////////////////////////////
-  //////////// Matrix-matrix products /////////////
-  /////////////////////////////////////////////////
-  
-  //
-  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
-  //
-  
-  std::cout << " ------ Benchmark 1: Matrix-Matrix product ------ " << std::endl;
-  
-  
-  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
-  for (size_t i=0; i<devices.size(); ++i)
-  {
-    viennacl::ocl::current_context().switch_device(devices[i]);
-    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
-
-    viennacl::fast_copy(&(stl_A[0]),
-                        &(stl_A[0]) + stl_A.size(),
-                        vcl_A);
-    viennacl::fast_copy(&(stl_B[0]),
-                        &(stl_B[0]) + stl_B.size(),
-                        vcl_B);
-    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-    viennacl::ocl::get_queue().finish();
-    timer.start();
-    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-    viennacl::ocl::get_queue().finish();
-    exec_time = timer.get();
-    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
-    std::cout << " - GFLOPs (counting multiply&add as one operation): " << (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
-    std::cout << std::endl;
-  }
-
-  std::cout << " ------ Benchmark 2: Matrix-Matrix product using ranges ------ " << std::endl;
-
-  viennacl::range r(BLAS3_MATRIX_SIZE/4, 3 * BLAS3_MATRIX_SIZE/4);
-  for (size_t i=0; i<devices.size(); ++i)
-  {
-    viennacl::ocl::current_context().switch_device(devices[i]);
-    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
-
-    viennacl::fast_copy(&(stl_A[0]),
-                        &(stl_A[0]) + stl_A.size(),
-                        vcl_A);
-    viennacl::fast_copy(&(stl_B[0]),
-                        &(stl_B[0]) + stl_B.size(),
-                        vcl_B);
-    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
-    viennacl::ocl::get_queue().finish();
-    timer.start();
-    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
-    viennacl::ocl::get_queue().finish();
-    exec_time = timer.get();
-    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
-    std::cout << " - GFLOPs (counting multiply&add as one operation): " << (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
-    std::cout << std::endl;
-  }
-
-  std::cout << " ------ Benchmark 3: Matrix-Matrix product using slices ------ " << std::endl;
-
-  viennacl::slice s(0, 2, BLAS3_MATRIX_SIZE/2);
-  for (size_t i=0; i<devices.size(); ++i)
-  {
-    viennacl::ocl::current_context().switch_device(devices[i]);
-    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
-
-    viennacl::fast_copy(&(stl_A[0]),
-                        &(stl_A[0]) + stl_A.size(),
-                        vcl_A);
-    viennacl::fast_copy(&(stl_B[0]),
-                        &(stl_B[0]) + stl_B.size(),
-                        vcl_B);
-    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
-    viennacl::ocl::get_queue().finish();
-    timer.start();
-    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
-    viennacl::ocl::get_queue().finish();
-    exec_time = timer.get();
-    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
-    std::cout << " - GFLOPs (counting multiply&add as one operation): " << (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
-    std::cout << std::endl;
-  }
-
-  return EXIT_SUCCESS;
-}
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Dense Matrix-Matrix product " << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  run_benchmark<float>();
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    run_benchmark<double>();
-  }
-  return 0;
-}
-=======
 /* =========================================================================
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
@@ -446,4 +244,3 @@ int main()
   }
   return 0;
 }
->>>>>>> upstream/1.5.1
diff --git a/examples/benchmarks/solver.cpp b/examples/benchmarks/solver.cpp
index 09c6093..5a27a3c 100644
--- a/examples/benchmarks/solver.cpp
+++ b/examples/benchmarks/solver.cpp
@@ -1,456 +1,3 @@
-<<<<<<< HEAD
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-
-#define VIENNACL_HAVE_UBLAS 1
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/ell_matrix.hpp"
-#include "viennacl/hyb_matrix.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/jacobi_precond.hpp"
-#include "viennacl/linalg/row_scaling.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/bicgstab.hpp"
-#include "viennacl/linalg/gmres.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-
-#include <iostream>
-#include <vector>
-#include "benchmark-utils.hpp"
-#include "io.hpp"
-
-
-using namespace boost::numeric;
-
-/*
-*   Benchmark:
-*   Iterative solver tests
-*   
-*/
-
-#define BENCHMARK_RUNS          1
-
-
-template <typename ScalarType>
-ScalarType diff_inf(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
-{
-   ublas::vector<ScalarType> v2_cpu(v2.size());
-   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
-
-   for (unsigned int i=0;i<v1.size(); ++i)
-   {
-      if ( std::max( fabs(v2_cpu[i]), fabs(v1[i]) ) > 0 )
-         v2_cpu[i] = fabs(v2_cpu[i] - v1[i]) / std::max( fabs(v2_cpu[i]), fabs(v1[i]) );
-      else
-         v2_cpu[i] = 0.0;
-   }
-
-   return norm_inf(v2_cpu);
-}
-
-template <typename ScalarType>
-ScalarType diff_2(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
-{
-   ublas::vector<ScalarType> v2_cpu(v2.size());
-   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
-
-   return norm_2(v1 - v2_cpu) / norm_2(v1);
-}
-
-
-template <typename MatrixType, typename VectorType, typename SolverTag, typename PrecondTag>
-void run_solver(MatrixType const & matrix, VectorType const & rhs, VectorType const & ref_result, SolverTag const & solver, PrecondTag const & precond, long ops)
-{
-  Timer timer;
-  VectorType result(rhs);
-  VectorType residual(rhs);
-  viennacl::ocl::get_queue().finish();
-  
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    result = viennacl::linalg::solve(matrix, rhs, solver, precond);
-  }
-  viennacl::ocl::get_queue().finish();
-  double exec_time = timer.get();
-  std::cout << "Exec. time: " << exec_time << std::endl;
-  std::cout << "Est. "; printOps(ops, exec_time / BENCHMARK_RUNS);
-  residual -= viennacl::linalg::prod(matrix, result);
-  std::cout << "Relative residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(rhs) << std::endl;
-  std::cout << "Estimated rel. residual: " << solver.error() << std::endl;
-  std::cout << "Iterations: " << solver.iters() << std::endl;
-  result -= ref_result;
-  std::cout << "Relative deviation from result: " << viennacl::linalg::norm_2(result) / viennacl::linalg::norm_2(ref_result) << std::endl;
-}
-
-
-template<typename ScalarType>
-int run_benchmark()
-{
-  
-  Timer timer;
-  double exec_time;
-   
-  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
-  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
-  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
-  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
-  
-  ublas::vector<ScalarType> ublas_vec1;
-  ublas::vector<ScalarType> ublas_vec2;
-  ublas::vector<ScalarType> ublas_result;
-  unsigned int solver_iters = 20;
-  unsigned int solver_krylov_dim = 20;
-  double solver_tolerance = 1e-6;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile<ScalarType>("../../examples/testdata/rhs65025.txt", ublas_vec1))
-  #else
-  if (!readVectorFromFile<ScalarType>("../examples/testdata/rhs65025.txt", ublas_vec1))
-  #endif
-  {
-    std::cout << "Error reading RHS file" << std::endl;
-    return 0;
-  }
-  std::cout << "done reading rhs" << std::endl;
-  ublas_vec2 = ublas_vec1;
-  #ifdef _MSC_VER
-  if (!readVectorFromFile<ScalarType>("../../examples/testdata/result65025.txt", ublas_result))
-  #else
-  if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_result))
-  #endif
-  {
-    std::cout << "Error reading result file" << std::endl;
-    return 0;
-  }
-  std::cout << "done reading result" << std::endl;
-  
-  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(ublas_vec1.size(), ublas_vec1.size());
-  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix(ublas_vec1.size(), ublas_vec1.size());
-  viennacl::ell_matrix<ScalarType> vcl_ell_matrix(ublas_vec1.size(), ublas_vec1.size());
-  viennacl::hyb_matrix<ScalarType> vcl_hyb_matrix(ublas_vec1.size(), ublas_vec1.size());
-
-  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
-  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size()); 
-  viennacl::vector<ScalarType> vcl_result(ublas_vec1.size()); 
-  
-
-  ublas::compressed_matrix<ScalarType> ublas_matrix;
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return EXIT_FAILURE;
-  }
-  //unsigned int cg_mat_size = cg_mat.size(); 
-  std::cout << "done reading matrix" << std::endl;
-  
-  //cpu to gpu:
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
-  viennacl::copy(ublas_matrix, vcl_coordinate_matrix);
-  viennacl::copy(ublas_matrix, vcl_ell_matrix);
-  viennacl::copy(ublas_matrix, vcl_hyb_matrix);
-  viennacl::copy(ublas_vec1, vcl_vec1);
-  viennacl::copy(ublas_vec2, vcl_vec2);
-  viennacl::copy(ublas_result, vcl_result);
-  
-  
-  viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
-  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
-  
-  viennacl::linalg::row_scaling< ublas::compressed_matrix<ScalarType> >    ublas_row_scaling(ublas_matrix, viennacl::linalg::row_scaling_tag(1));
-  viennacl::linalg::row_scaling< viennacl::compressed_matrix<ScalarType> > vcl_row_scaling(vcl_compressed_matrix, viennacl::linalg::row_scaling_tag(1));
-  
-  ///////////////////////////////////////////////////////////////////////////////
-  //////////////////////           ILU preconditioner         //////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  std::cout << "------- ILU0 on CPU (ublas) ----------" << std::endl;
-
-  timer.start();
-  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
-  exec_time = timer.get();
-  std::cout << "Setup time: " << exec_time << std::endl;
-  
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    ublas_ilu0.apply(ublas_vec1);
-  }
-  exec_time = timer.get();
-  std::cout << "ublas time: " << exec_time << std::endl;
-  
-  std::cout << "------- ILU0 with ViennaCL ----------" << std::endl;
-
-  timer.start();
-  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
-  exec_time = timer.get();
-  std::cout << "Setup time: " << exec_time << std::endl;
-  
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_ilu0.apply(vcl_vec1);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "ViennaCL time: " << exec_time << std::endl;
-  
-  
-  std::cout << "------- ILUT on CPU (ublas) ----------" << std::endl;
-
-  timer.start();
-  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
-  exec_time = timer.get();
-  std::cout << "Setup time: " << exec_time << std::endl;
-  
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    ublas_ilut.apply(ublas_vec1);
-  }
-  exec_time = timer.get();
-  std::cout << "ublas time: " << exec_time << std::endl;
-
-  std::cout << "------- ILUT with ViennaCL ----------" << std::endl;
-
-  timer.start();
-  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
-  exec_time = timer.get();
-  std::cout << "Setup time: " << exec_time << std::endl;
-  
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_ilut.apply(vcl_vec1);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "ViennaCL time: " << exec_time << std::endl;
-  
-  ///////////////////////////////////////////////////////////////////////////////
-  //////////////////////              CG solver                //////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  long cg_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + 6 * ublas_vec2.size()));
-  
-  viennacl::linalg::cg_tag cg_solver(solver_tolerance, solver_iters);
-  
-  std::cout << "------- CG solver (no preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
-  
-  std::cout << "------- CG solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
- 
-  std::cout << "------- CG solver (no preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
-
-  std::cout << "------- CG solver (no preconditioner) via ViennaCL, ell_matrix ----------" << std::endl;
-  run_solver(vcl_ell_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
-
-  std::cout << "------- CG solver (no preconditioner) via ViennaCL, hyb_matrix ----------" << std::endl;
-  run_solver(vcl_hyb_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
-  
-
-  std::cout << "------- CG solver (ILU0 preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ilu0, cg_ops);
-
-  std::cout << "------- CG solver (ILU0 preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilu0, cg_ops);
-  
-  std::cout << "------- CG solver (ILUT preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ilut, cg_ops);
-  
-  std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
-  
-//  std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
-  
-  
-  std::cout << "------- CG solver (Jacobi preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_jacobi, cg_ops);
-  
-  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi, cg_ops);
-  
-//  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi, cg_ops);
-  
-  
-  std::cout << "------- CG solver (row scaling preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_row_scaling, cg_ops);
-  
-  std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling, cg_ops);
-  
-//  std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling, cg_ops);
-  
-  ///////////////////////////////////////////////////////////////////////////////
-  //////////////////////           BiCGStab solver             //////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  
-  long bicgstab_ops = static_cast<long>(solver_iters * (2 * ublas_matrix.nnz() + 13 * ublas_vec2.size()));
-  
-  viennacl::linalg::bicgstab_tag bicgstab_solver(solver_tolerance, solver_iters);
-                                                                             
-  std::cout << "------- BiCGStab solver (no preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
-  
-//  std::cout << "------- BiCGStab solver (no preconditioner) on GPU, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, bicgstab_ops);
-
-  
-  std::cout << "------- BiCGStab solver (ILUT preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_ilut, bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
-  
-//  std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (Jacobi preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_jacobi, bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi, bicgstab_ops);
-  
-//  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi, bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (row scaling preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_row_scaling, bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling, bicgstab_ops);
-  
-//  std::cout << "------- CG solver row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling, bicgstab_ops);
-
-  ///////////////////////////////////////////////////////////////////////////////
-  ///////////////////////            GMRES solver             ///////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  
-  long gmres_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + (solver_iters * 2 + 7) * ublas_vec2.size()));
-  
-  viennacl::linalg::gmres_tag gmres_solver(solver_tolerance, solver_iters, solver_krylov_dim);
-  
-  std::cout << "------- GMRES solver (no preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
-  
-  std::cout << "------- GMRES solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
-  
-//  std::cout << "------- GMRES solver (no preconditioner) on GPU, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
-
-  
-  std::cout << "------- GMRES solver (ILUT preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_ilut, gmres_ops);
-  
-  std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
-  
-//  std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
-
-
-  std::cout << "------- GMRES solver (Jacobi preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_jacobi, gmres_ops);
-  
-  std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi, gmres_ops);
-  
-//  std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi, gmres_ops);
-  
-  
-  std::cout << "------- GMRES solver (row scaling preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_row_scaling, gmres_ops);
-  
-  std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling, gmres_ops);
-  
-//  std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling, gmres_ops);
-  
-  return 0;
-}
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
-  std::cout << "---------------------------------------------------------------------------" << std::endl;
-  std::cout << "---------------------------------------------------------------------------" << std::endl;
-  std::cout << " Benchmark for Execution Times of Iterative Solvers provided with ViennaCL " << std::endl;
-  std::cout << "---------------------------------------------------------------------------" << std::endl;
-  std::cout << " Note that the purpose of this benchmark is not to run solvers until" << std::endl;
-  std::cout << " convergence. Instead, only the execution times of a few iterations are" << std::endl;
-  std::cout << " recorded. Residual errors are only printed for information." << std::endl << std::endl;
-   
-
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Solver" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-  std::cout << " ATTENTION: Please be aware that GMRES may not work on ATI GPUs with Stream SDK v2.1." << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  run_benchmark<float>();
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    run_benchmark<double>();
-  }
-  return 0;
-}
-
-=======
 /* =========================================================================
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
@@ -1096,4 +643,3 @@ int main()
   return 0;
 }
 
->>>>>>> upstream/1.5.1
diff --git a/examples/benchmarks/sparse.cpp b/examples/benchmarks/sparse.cpp
index c458069..ce03884 100644
--- a/examples/benchmarks/sparse.cpp
+++ b/examples/benchmarks/sparse.cpp
@@ -1,308 +1,3 @@
-<<<<<<< HEAD
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//#define VIENNACL_BUILD_INFO
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-#define VIENNACL_HAVE_UBLAS 1
-
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/ell_matrix.hpp"
-#include "viennacl/hyb_matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-
-#include <iostream>
-#include <vector>
-#include "benchmark-utils.hpp"
-#include "io.hpp"
-
-
-/*
-*   Benchmark 1:
-*   Sparse matrix-vector product tests
-*   
-*/
-
-#define BENCHMARK_RUNS          10
-
-
-template<typename ScalarType>
-int run_benchmark()
-{   
-   Timer timer;
-   double exec_time;
-   
-   ScalarType std_result = 0;
-   
-  ScalarType std_factor1 = ScalarType(3.1415);
-  ScalarType std_factor2 = ScalarType(42.0);
-  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
-  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
-  
-  boost::numeric::ublas::vector<ScalarType> ublas_vec1;
-  boost::numeric::ublas::vector<ScalarType> ublas_vec2;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile<ScalarType>("../../examples/testdata/result65025.txt", ublas_vec1))
-  #else
-  if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_vec1))
-  #endif
-  {
-    std::cout << "Error reading RHS file" << std::endl;
-    return 0;
-  }
-  std::cout << "done reading rhs" << std::endl;
-  ublas_vec2 = ublas_vec1;
-  
-  viennacl::compressed_matrix<ScalarType, 1> vcl_compressed_matrix_1;
-  viennacl::compressed_matrix<ScalarType, 4> vcl_compressed_matrix_4;
-  viennacl::compressed_matrix<ScalarType, 8> vcl_compressed_matrix_8;
-  
-  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix_128;
-
-  viennacl::ell_matrix<ScalarType, 1> vcl_ell_matrix_1;
-  viennacl::hyb_matrix<ScalarType, 1> vcl_hyb_matrix_1;
-
-  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return 0;
-  }
-  //unsigned int cg_mat_size = cg_mat.size(); 
-  std::cout << "done reading matrix" << std::endl;
-  
-  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
-  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size()); 
-  viennacl::vector<ScalarType> vcl_vec3(ublas_vec1.size()); 
-  
-  //cpu to gpu:
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix_1);
-  #ifndef VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix_4);
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix_8);
-  #endif
-  viennacl::copy(ublas_matrix, vcl_coordinate_matrix_128);
-  viennacl::copy(ublas_matrix, vcl_ell_matrix_1);
-  viennacl::copy(ublas_matrix, vcl_hyb_matrix_1);
-  viennacl::copy(ublas_vec1, vcl_vec1);
-  viennacl::copy(ublas_vec2, vcl_vec2);
-
-  
-  ///////////// Matrix operations /////////////////
-  
-  std::cout << "------- Matrix-Vector product on CPU ----------" << std::endl;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    ublas_vec1 = prod(ublas_matrix, ublas_vec2);
-  }
-  exec_time = timer.get();
-  std::cout << "CPU time: " << exec_time << std::endl;
-  std::cout << "CPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << ublas_vec1[0] << std::endl;
-  
-  
-  std::cout << "------- Matrix-Vector product with compressed_matrix ----------" << std::endl;
-  
-  
-  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2); //startup calculation
-  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2); //startup calculation
-  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2); //startup calculation
-  std_result = 0.0;
-  
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time align1: " << exec_time << std::endl;
-  std::cout << "GPU align1 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time align4: " << exec_time << std::endl;
-  std::cout << "GPU align4 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time align8: " << exec_time << std::endl;
-  std::cout << "GPU align8 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-  
-  
-  std::cout << "------- Matrix-Vector product with coordinate_matrix ----------" << std::endl;
-  vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2); //startup calculation
-  viennacl::ocl::get_queue().finish();
-  
-  viennacl::copy(vcl_vec1, ublas_vec2);  
-  long err_cnt = 0;
-  for (size_t i=0; i<ublas_vec1.size(); ++i)
-  {
-    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
-    {
-      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
-      ++err_cnt;
-      if (err_cnt > 5)
-        break;
-    }
-  }
-  
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-
-  
-  std::cout << "------- Matrix-Vector product with ell_matrix ----------" << std::endl;
-  vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2); //startup calculation
-  viennacl::ocl::get_queue().finish();
-  
-  viennacl::copy(vcl_vec1, ublas_vec2);  
-  err_cnt = 0;
-  for (size_t i=0; i<ublas_vec1.size(); ++i)
-  {
-    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
-    {
-      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
-      ++err_cnt;
-      if (err_cnt > 5)
-        break;
-    }
-  }
-  
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-
-  
-  std::cout << "------- Matrix-Vector product with hyb_matrix ----------" << std::endl;
-  vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2); //startup calculation
-  viennacl::ocl::get_queue().finish();
-  
-  viennacl::copy(vcl_vec1, ublas_vec2);  
-  err_cnt = 0;
-  for (size_t i=0; i<ublas_vec1.size(); ++i)
-  {
-    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
-    {
-      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
-      ++err_cnt;
-      if (err_cnt > 5)
-        break;
-    }
-  }
-  
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-  
-  
-  return EXIT_SUCCESS;
-}
-
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Sparse" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  run_benchmark<float>();
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    run_benchmark<double>();
-  }
-  return 0;
-}
-
-=======
 /* =========================================================================
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
@@ -625,4 +320,3 @@ int main()
   return 0;
 }
 
->>>>>>> upstream/1.5.1
diff --git a/examples/tutorial/CMakeLists.txt b/examples/tutorial/CMakeLists.txt
index 75fb4d0..eacf4eb 100644
--- a/examples/tutorial/CMakeLists.txt
+++ b/examples/tutorial/CMakeLists.txt
@@ -14,11 +14,7 @@ endforeach()
 
 if(ENABLE_UBLAS)
    include_directories(${Boost_INCLUDE_DIRS})
-<<<<<<< HEAD
-   foreach(tut amg blas2 blas3 iterative iterative-ublas lanczos matrix-range power-iter qr spai sparse structured-matrices vector-range)
-=======
    foreach(tut blas2 blas3 iterative-ublas lanczos least-squares matrix-range power-iter qr sparse vector-range)
->>>>>>> upstream/1.5.1
       add_executable(${tut} ${tut}.cpp)
       target_link_libraries(${tut} ${Boost_LIBRARIES})
       if (ENABLE_OPENCL)
@@ -26,9 +22,6 @@ if(ENABLE_UBLAS)
         set_target_properties(${tut} PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
       endif (ENABLE_OPENCL)
    endforeach()
-   
-   target_link_libraries(lanczos ${OPENCL_LIBRARIES} boost_system)
-   target_link_libraries(power-iter ${OPENCL_LIBRARIES} boost_system)
 endif()
 
 if(ENABLE_EIGEN)
diff --git a/examples/tutorial/iterative-ublas.cpp b/examples/tutorial/iterative-ublas.cpp
index 2e07dc7..ad6a87e 100644
--- a/examples/tutorial/iterative-ublas.cpp
+++ b/examples/tutorial/iterative-ublas.cpp
@@ -1,171 +1,3 @@
-<<<<<<< HEAD
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// Necessary to obtain suitable performance with ublas
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/operation.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-//
-// ViennaCL includes
-//
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/bicgstab.hpp"
-#include "viennacl/linalg/gmres.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-/*
-*
-*   Tutorial:  Iterative solvers without OpenCL
-*   
-*/
-using namespace boost::numeric;
-
-
-int main()
-{
-  typedef float       ScalarType;
-  
-  //
-  // Set up some ublas objects
-  //
-  ublas::vector<ScalarType> rhs;
-  ublas::vector<ScalarType> rhs2;
-  ublas::vector<ScalarType> ref_result;
-  ublas::vector<ScalarType> result;
-  ublas::compressed_matrix<ScalarType> ublas_matrix;
-  
-  //
-  // Read system from file
-  //
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading matrix" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", rhs))
-  #else
-  if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
-  #endif
-  {
-    std::cout << "Error reading RHS file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading rhs" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/result65025.txt", ref_result))
-  #else
-  if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
-  #endif
-  {
-    std::cout << "Error reading Result file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading result" << std::endl;
-
-  
-  //
-  // set up ILUT preconditioners for ViennaCL and ublas objects. Other preconditioners can also be used (see manual)
-  // 
-  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
-  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
-  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
-                                       viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
-  
-  //
-  // Conjugate gradient solver:
-  //
-  std::cout << "----- CG Test -----" << std::endl;
-
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
-  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
-  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilu0);
-  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_block_ilu0);
-  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
-  
-  //
-  // Stabilized BiConjugate gradient solver:
-  //
-  std::cout << "----- BiCGStab Test -----" << std::endl;
-
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilu0); //with preconditioner
-  
-  //
-  // GMRES solver:
-  //
-  std::cout << "----- GMRES Test -----" << std::endl;
-
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilu0);//with preconditioner
-
-  //
-  //  That's it. 
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
-=======
 /* =========================================================================
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
@@ -322,4 +154,3 @@ int main()
   return 0;
 }
 
->>>>>>> upstream/1.5.1
diff --git a/examples/tutorial/iterative.cpp b/examples/tutorial/iterative.cpp
index 1567130..1efde9d 100644
--- a/examples/tutorial/iterative.cpp
+++ b/examples/tutorial/iterative.cpp
@@ -1,249 +1,3 @@
-<<<<<<< HEAD
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// Necessary to obtain suitable performance with ublas
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/operation.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/jacobi_precond.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/bicgstab.hpp"
-#include "viennacl/linalg/gmres.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-
-/*
-*
-*   Tutorial:  Iterative solvers
-*   
-*/
-using namespace boost::numeric;
-
-
-int main()
-{
-  typedef float       ScalarType;
-  
-  //
-  // Set up some ublas objects
-  //
-  ublas::vector<ScalarType> rhs;
-  ublas::vector<ScalarType> rhs2;
-  ublas::vector<ScalarType> ref_result;
-  ublas::vector<ScalarType> result;
-  ublas::compressed_matrix<ScalarType> ublas_matrix;
-  
-  //
-  // Read system from file
-  //
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading matrix" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", rhs))
-  #else
-  if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
-  #endif
-  {
-    std::cout << "Error reading RHS file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading rhs" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/result65025.txt", ref_result))
-  #else
-  if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
-  #endif
-  {
-    std::cout << "Error reading Result file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading result" << std::endl;
-
-  //
-  // Set up some ViennaCL objects
-  //
-  size_t vcl_size = rhs.size();
-  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix;
-  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix;
-  viennacl::vector<ScalarType> vcl_rhs(vcl_size); 
-  viennacl::vector<ScalarType> vcl_result(vcl_size);
-  viennacl::vector<ScalarType> vcl_ref_result(vcl_size);
-  
-  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-  viennacl::copy(ref_result.begin(), ref_result.end(), vcl_ref_result.begin());
-  
-  
-  //
-  // Transfer ublas-matrix to GPU:
-  //
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
-  
-  //
-  // alternative way: via STL. Sparse matrix as std::vector< std::map< unsigned int, ScalarType> >
-  //
-  std::vector< std::map< unsigned int, ScalarType> > stl_matrix(rhs.size());
-  for (ublas::compressed_matrix<ScalarType>::iterator1 iter1 = ublas_matrix.begin1();
-       iter1 != ublas_matrix.end1();
-       ++iter1)
-  {
-    for (ublas::compressed_matrix<ScalarType>::iterator2 iter2 = iter1.begin();
-         iter2 != iter1.end();
-         ++iter2)
-         stl_matrix[iter2.index1()][static_cast<unsigned int>(iter2.index2())] = *iter2;
-  }
-  viennacl::copy(stl_matrix, vcl_coordinate_matrix);
-  viennacl::copy(vcl_coordinate_matrix, stl_matrix);
-  
-  //
-  // set up ILUT preconditioners for ublas and ViennaCL objects:
-  // 
-  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
-  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
-  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
-                                       viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
-  
-  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
-  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
-  viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<ScalarType>,
-                                       viennacl::linalg::ilu0_tag>          vcl_block_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
-
-  //
-  // set up Jacobi preconditioners for ViennaCL and ublas objects:
-  // 
-  viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
-  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
-  
-  //
-  // Conjugate gradient solver:
-  //
-  std::cout << "----- CG Test -----" << std::endl;
-  
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_jacobi);
-
-  
-  //
-  // for ViennaCL objects:
-  //
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag());
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_ilut);
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_jacobi);
-  
-  //
-  // Stabilized BiConjugate gradient solver:
-  //
-  std::cout << "----- BiCGStab Test -----" << std::endl;
-
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_jacobi); //with preconditioner
-
-  
-  //
-  // for ViennaCL objects:
-  //
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag());   //without preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_ilut); //with preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_jacobi); //with preconditioner
-  
-  //
-  // GMRES solver:
-  //
-  std::cout << "----- GMRES Test -----" << std::endl;
-  std::cout << " ATTENTION: Please be aware that GMRES may not work on ATI GPUs when using Stream SDK v2.1." << std::endl;
-
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_jacobi);//with preconditioner
-
-  //
-  // for ViennaCL objects:
-  //
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag());   //without preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_ilut);//with preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_jacobi);//with preconditioner
-
-  //
-  //  That's it.
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
-=======
 /* =========================================================================
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
@@ -479,4 +233,3 @@ int main()
   return 0;
 }
 
->>>>>>> upstream/1.5.1
diff --git a/examples/tutorial/lanczos.cpp b/examples/tutorial/lanczos.cpp
index aabec25..9ac94e9 100644
--- a/examples/tutorial/lanczos.cpp
+++ b/examples/tutorial/lanczos.cpp
@@ -1,39 +1,26 @@
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-<<<<<<< HEAD
-=======
 /*
 *
 *   Tutorial: Calculation of eigenvalues using Lanczos' method (lanczos.cpp and lanczos.cu are identical, the latter being required for compilation using CUDA nvcc)
 *
 */
 
->>>>>>> upstream/1.5.1
 // include necessary system headers
 #include <iostream>
 
@@ -41,11 +28,7 @@
   #define NDEBUG
 #endif
 
-<<<<<<< HEAD
-#define VIENNACL_HAVE_UBLAS
-=======
 #define VIENNACL_WITH_UBLAS
->>>>>>> upstream/1.5.1
 
 //include basic scalar and vector types of ViennaCL
 #include "viennacl/scalar.hpp"
@@ -66,21 +49,10 @@
 #include <boost/numeric/ublas/matrix_expression.hpp>
 #include <boost/numeric/ublas/matrix_sparse.hpp>
 #include <boost/numeric/ublas/vector.hpp>
-<<<<<<< HEAD
-#include <boost/numeric/ublas/operation.hpp> 
-#include <boost/numeric/ublas/vector_expression.hpp>
-#include <boost/filesystem.hpp>
-
-
-/*
-*   Tutorial: calculation of eigenvalues - Lanczos method and power iteration
-*/
-=======
 #include <boost/numeric/ublas/operation.hpp>
 #include <boost/numeric/ublas/vector_expression.hpp>
 
 
->>>>>>> upstream/1.5.1
 
 template <typename MatrixType>
 std::vector<double> initEig(MatrixType const & A)
@@ -88,15 +60,9 @@ std::vector<double> initEig(MatrixType const & A)
   viennacl::linalg::lanczos_tag ltag(0.75, 10, viennacl::linalg::lanczos_tag::partial_reorthogonalization, 1700);
   std::vector<double> lanczos_eigenvalues = viennacl::linalg::eig(A, ltag);
   for(std::size_t i = 0; i< lanczos_eigenvalues.size(); i++){
-<<<<<<< HEAD
-          std::cout << "Eigenvalue " << i+1 << ": " << std::setprecision(10) << lanczos_eigenvalues[i] << std::endl; 
-  }
-  
-=======
           std::cout << "Eigenvalue " << i+1 << ": " << std::setprecision(10) << lanczos_eigenvalues[i] << std::endl;
   }
 
->>>>>>> upstream/1.5.1
   return lanczos_eigenvalues;
 }
 
@@ -104,29 +70,16 @@ std::vector<double> initEig(MatrixType const & A)
 int main()
 {
   typedef double     ScalarType;
-<<<<<<< HEAD
-  
-  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_A;
-
-  viennacl::compressed_matrix<double>  vcl_A(ublas_A.size1(), ublas_A.size2());  
-  viennacl::copy(ublas_A, vcl_A);
-  
-=======
 
   boost::numeric::ublas::compressed_matrix<ScalarType> ublas_A;
 
->>>>>>> upstream/1.5.1
   if (!viennacl::io::read_matrix_market_file(ublas_A, "../examples/testdata/mat65k.mtx"))
   {
     std::cout << "Error reading Matrix file" << std::endl;
     return 0;
   }
-<<<<<<< HEAD
-  
-=======
 
   std::cout << "Running Lanczos algorithm (this might take a while)..." << std::endl;
->>>>>>> upstream/1.5.1
   std::vector<double> eigenvalues = initEig(ublas_A);
 }
 
diff --git a/examples/tutorial/power-iter.cpp b/examples/tutorial/power-iter.cpp
index 20d09ff..3028ca7 100644
--- a/examples/tutorial/power-iter.cpp
+++ b/examples/tutorial/power-iter.cpp
@@ -1,32 +1,20 @@
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-<<<<<<< HEAD
-=======
 /*
 *
 *   Tutorial: Calculation of the eigenvalue with largest modulus using the power iteration method
@@ -34,7 +22,6 @@
 *
 */
 
->>>>>>> upstream/1.5.1
 // include necessary system headers
 #include <iostream>
 
@@ -42,11 +29,7 @@
   #define NDEBUG
 #endif
 
-<<<<<<< HEAD
-#define VIENNACL_HAVE_UBLAS
-=======
 #define VIENNACL_WITH_UBLAS
->>>>>>> upstream/1.5.1
 
 //include basic scalar and vector types of ViennaCL
 #include "viennacl/scalar.hpp"
@@ -66,30 +49,15 @@
 #include <boost/numeric/ublas/matrix_expression.hpp>
 #include <boost/numeric/ublas/matrix_sparse.hpp>
 #include <boost/numeric/ublas/vector.hpp>
-<<<<<<< HEAD
-#include <boost/numeric/ublas/operation.hpp> 
-#include <boost/numeric/ublas/vector_expression.hpp>
-#include <boost/filesystem.hpp>
-
-
-/*
-*   Tutorial: Power Iteration for finding the eigenvalue with largest modulus
-*/
-=======
 #include <boost/numeric/ublas/operation.hpp>
 #include <boost/numeric/ublas/vector_expression.hpp>
->>>>>>> upstream/1.5.1
 
 
 
 int main()
 {
   typedef double     ScalarType;
-<<<<<<< HEAD
-  
-=======
 
->>>>>>> upstream/1.5.1
   boost::numeric::ublas::compressed_matrix<ScalarType> ublas_A;
 
   if (!viennacl::io::read_matrix_market_file(ublas_A, "../examples/testdata/mat65k.mtx"))
@@ -97,27 +65,15 @@ int main()
     std::cout << "Error reading Matrix file" << std::endl;
     return 0;
   }
-<<<<<<< HEAD
-  
-  viennacl::compressed_matrix<double>  vcl_A(ublas_A.size1(), ublas_A.size2());  
-  viennacl::copy(ublas_A, vcl_A);
-  
-  viennacl::linalg::power_iter_tag ptag(1e-8);
-=======
 
   viennacl::compressed_matrix<double>  vcl_A(ublas_A.size1(), ublas_A.size2());
   viennacl::copy(ublas_A, vcl_A);
 
   viennacl::linalg::power_iter_tag ptag(1e-6);
->>>>>>> upstream/1.5.1
 
   std::cout << "Starting computation of eigenvalue with largest modulus (might take about a minute)..." << std::endl;
   std::cout << "Result of power iteration with ublas matrix (single-threaded): " << viennacl::linalg::eig(ublas_A, ptag) << std::endl;
   std::cout << "Result of power iteration with ViennaCL (OpenCL accelerated): " << viennacl::linalg::eig(vcl_A, ptag) << std::endl;
-<<<<<<< HEAD
-  
-=======
 
->>>>>>> upstream/1.5.1
 }
 
diff --git a/examples/tutorial/qr.cpp b/examples/tutorial/qr.cpp
index c143737..7b7bc77 100644
--- a/examples/tutorial/qr.cpp
+++ b/examples/tutorial/qr.cpp
@@ -98,11 +98,7 @@ int main (int, const char **)
 
   std::size_t rows = 113;   //number of rows in the matrix
   std::size_t cols = 54;   //number of columns
-<<<<<<< HEAD
-  
-=======
 
->>>>>>> upstream/1.5.1
   //
   // Create matrices with some data
   //
@@ -155,11 +151,7 @@ int main (int, const char **)
   MatrixType ublas_QR = prod(Q, R);
   double ublas_error = check(ublas_QR, ublas_A_backup);
   std::cout << "Max rel error (ublas): " << ublas_error << std::endl;
-<<<<<<< HEAD
-  
-=======
 
->>>>>>> upstream/1.5.1
   //
   // QR factorization in ViennaCL using Boost.uBLAS for the panel factorization
   //
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b8114fd..dfc29ab 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,13 +1,3 @@
-<<<<<<< HEAD
-foreach(PROG blas3 blas3range fft iterators
-        generator_inner_product generator_matrix generator_matrix_vector_product generator_vector
-        matrix matrix_range matrix_slice nmf
-        scalar sparse structured-matrices svd
-        vector vector_range vector_slice)
-   add_executable(${PROG}-test src/${PROG}.cpp)
-   target_link_libraries(${PROG}-test ${OPENCL_LIBRARIES})
-   add_test(${PROG} ${PROG}-test)
-=======
 
 include_directories(${Boost_INCLUDE_DIRS})
 
@@ -23,7 +13,6 @@ foreach(PROG blas3_prod_float blas3_prod_double blas3_solve_float blas3_solve_do
    add_executable(${PROG}-test-cpu src/${PROG}.cpp)
    target_link_libraries(${PROG}-test-cpu ${Boost_LIBRARIES})
    add_test(${PROG}-cpu ${PROG}-test-cpu)
->>>>>>> upstream/1.5.1
 endforeach(PROG)
 
 
diff --git a/tests/src/blas3_solve_double.cpp b/tests/src/blas3_solve_double.cpp
index be81971..e063f79 100644
--- a/tests/src/blas3_solve_double.cpp
+++ b/tests/src/blas3_solve_double.cpp
@@ -88,16 +88,10 @@ template <typename ScalarType, typename VCLMatrixType>
 ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
 {
    ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
-<<<<<<< HEAD:tests/src/blas3.cpp
-   viennacl::copy(mat2, mat2_cpu);
-   double ret = 0;
-   double act = 0;
-=======
    viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
    viennacl::copy(mat2, mat2_cpu);
    ScalarType ret = 0;
    ScalarType act = 0;
->>>>>>> upstream/1.5.1:tests/src/blas3_solve_double.cpp
 
     for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
     {
@@ -106,11 +100,6 @@ ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
          act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
          if (act > ret)
            ret = act;
-         if (act > 0.1)
-         {
-           std::cout << "Offending index: " << i << ", " << j << std::endl;
-           exit(0);
-         }
       }
     }
    //std::cout << ret << std::endl;
@@ -122,112 +111,8 @@ ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
 //
 // Triangular solvers
 //
-<<<<<<< HEAD:tests/src/blas3.cpp
-template< typename NumericT, typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC, typename Epsilon >
-int test_prod(Epsilon const& epsilon)
-{
-   int retval = EXIT_SUCCESS;
-   //long matrix_size1 = 157;  //some odd number, not too large
-   //long matrix_size2 = 91;  //some odd number, not too large
-   //long matrix_size3 = 73;  //some odd number, not too large
-   long matrix_size1 = 128;  //some odd number, not too large
-   long matrix_size2 = 64;  //some odd number, not too large
-   long matrix_size3 = 128;  //some odd number, not too large
-   NumericT act_diff = 0;
-   
-   // --------------------------------------------------------------------------            
-   ublas::matrix<NumericT> A(matrix_size1, matrix_size2);
-   ublas::matrix<NumericT> B(matrix_size2, matrix_size3);
-   ublas::matrix<NumericT> C(matrix_size1, matrix_size3);
-
-   //fill A and B:
-   for (unsigned int i = 0; i < A.size1(); ++i)
-      for (unsigned int j = 0; j < A.size2(); ++j)
-         A(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-   for (unsigned int i = 0; i < B.size1(); ++i)
-      for (unsigned int j = 0; j < B.size2(); ++j)
-         B(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-
-   ublas::matrix<NumericT> A_trans = trans(A);
-   ublas::matrix<NumericT> B_trans = trans(B);
-   
-   MatrixTypeA vcl_A(matrix_size1, matrix_size2);
-   MatrixTypeB vcl_B(matrix_size2, matrix_size3);
-   MatrixTypeA vcl_A_trans(matrix_size2, matrix_size1);
-   MatrixTypeB vcl_B_trans(matrix_size3, matrix_size2);
-   MatrixTypeC vcl_C(matrix_size1, matrix_size3);
-
-   
-   viennacl::copy(A, vcl_A);
-   viennacl::copy(B, vcl_B);
-   viennacl::copy(A_trans, vcl_A_trans);
-   viennacl::copy(B_trans, vcl_B_trans);
-
-   // Test: C = A * B --------------------------------------------------------------------------       
-   C     = viennacl::linalg::prod(A, B);
-   vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = A * B passed!" << std::endl;
-   
-   // Test: C = A * trans(B) --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(A, trans(B_trans));
-   vcl_C = viennacl::linalg::prod(vcl_A, trans(vcl_B_trans));
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = A * trans(B) passed!" << std::endl;
-   
-   // Test: C = trans(A) * B --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(trans(A_trans), B);
-   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), vcl_B);
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = trans(A) * B passed!" << std::endl;
-   
-   
-   // Test: C = trans(A) * trans(B) --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
-   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans));
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = trans(A) * trans(B) passed!" << std::endl;
-   
-   
-   
-   return retval;
-}
-=======
 
 
->>>>>>> upstream/1.5.1:tests/src/blas3_solve_double.cpp
 
 template <typename RHSTypeRef, typename RHSTypeCheck, typename Epsilon >
 void run_solver_check(RHSTypeRef & B_ref, RHSTypeCheck & B_check, int & retval, Epsilon const & epsilon)
diff --git a/tests/src/external_1.cpp b/tests/src/external_1.cpp
index 1c52375..68504ff 100644
--- a/tests/src/external_1.cpp
+++ b/tests/src/external_1.cpp
@@ -38,19 +38,12 @@
 #include "viennacl/coordinate_matrix.hpp"
 #include "viennacl/ell_matrix.hpp"
 #include "viennacl/hyb_matrix.hpp"
-<<<<<<< HEAD
-#include "viennacl/circulant_matrix.hpp"
-#include "viennacl/hankel_matrix.hpp"
-#include "viennacl/toeplitz_matrix.hpp"
-#include "viennacl/vandermonde_matrix.hpp"
-=======
 #ifdef VIENNACL_WITH_OPENCL
   #include "viennacl/circulant_matrix.hpp"
   #include "viennacl/hankel_matrix.hpp"
   #include "viennacl/toeplitz_matrix.hpp"
   #include "viennacl/vandermonde_matrix.hpp"
 #endif
->>>>>>> upstream/1.5.1
 
 #include "viennacl/linalg/ilu.hpp"
 #include "viennacl/linalg/row_scaling.hpp"
@@ -60,7 +53,6 @@
 #include "viennacl/linalg/gmres.hpp"
 #include "viennacl/linalg/direct_solve.hpp"
 #include "viennacl/linalg/qr.hpp"
-#include "viennacl/linalg/svd.hpp"
 
 #include "viennacl/misc/bandwidth_reduction.hpp"
 
@@ -76,8 +68,6 @@
 #include "viennacl/scheduler/execute.hpp"
 
 
-#include "viennacl/generator/custom_operation.hpp"
-
 
 //defined in external_2.cpp
 void other_func();
diff --git a/tests/src/external_2.cpp b/tests/src/external_2.cpp
index 67dc830..bc3c34f 100644
--- a/tests/src/external_2.cpp
+++ b/tests/src/external_2.cpp
@@ -38,19 +38,12 @@
 #include "viennacl/coordinate_matrix.hpp"
 #include "viennacl/ell_matrix.hpp"
 #include "viennacl/hyb_matrix.hpp"
-<<<<<<< HEAD
-#include "viennacl/circulant_matrix.hpp"
-#include "viennacl/hankel_matrix.hpp"
-#include "viennacl/toeplitz_matrix.hpp"
-#include "viennacl/vandermonde_matrix.hpp"
-=======
 #ifdef VIENNACL_WITH_OPENCL
   #include "viennacl/circulant_matrix.hpp"
   #include "viennacl/hankel_matrix.hpp"
   #include "viennacl/toeplitz_matrix.hpp"
   #include "viennacl/vandermonde_matrix.hpp"
 #endif
->>>>>>> upstream/1.5.1
 
 #include "viennacl/linalg/ilu.hpp"
 #include "viennacl/linalg/row_scaling.hpp"
@@ -60,10 +53,6 @@
 #include "viennacl/linalg/gmres.hpp"
 #include "viennacl/linalg/direct_solve.hpp"
 #include "viennacl/linalg/qr.hpp"
-<<<<<<< HEAD
-#include "viennacl/linalg/svd.hpp"
-=======
->>>>>>> upstream/1.5.1
 
 #include "viennacl/misc/bandwidth_reduction.hpp"
 
@@ -78,8 +67,6 @@
 #include "viennacl/io/matrix_market.hpp"
 #include "viennacl/scheduler/execute.hpp"
 
-#include "viennacl/generator/custom_operation.hpp"
-
 void other_func()
 {
   typedef float   NumericType;
diff --git a/tests/src/nmf.cpp b/tests/src/nmf.cpp
index 30aae67..5be2b4d 100644
--- a/tests/src/nmf.cpp
+++ b/tests/src/nmf.cpp
@@ -1,19 +1,3 @@
-<<<<<<< HEAD
-#include <ctime>
-#include <cmath>
-
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/nmf.hpp"
-
-#include "examples/benchmarks/benchmark-utils.hpp"
-
-typedef float ScalarType;
-
-const ScalarType EPS = 0.1;
-
-float matrix_compare(viennacl::matrix<ScalarType>& res,
-                     viennacl::matrix<ScalarType>& ref) 
-=======
 /* =========================================================================
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
@@ -44,7 +28,6 @@ const ScalarType EPS = ScalarType(0.1);
 
 float matrix_compare(viennacl::matrix<ScalarType>& res,
                      viennacl::matrix<ScalarType>& ref)
->>>>>>> upstream/1.5.1
 {
     std::vector<ScalarType> res_std(res.internal_size());
     std::vector<ScalarType> ref_std(ref.internal_size());
@@ -63,18 +46,6 @@ float matrix_compare(viennacl::matrix<ScalarType>& res,
     return diff / mx;
 }
 
-<<<<<<< HEAD
-void fill_random(std::vector<ScalarType>& v)
-{
-    for(std::size_t j = 0; j < v.size(); j++)
-        v[j] = static_cast<ScalarType>(rand()) / RAND_MAX;
-}
-
-void test_nmf(std::size_t m, std::size_t k, std::size_t n)
-{
-    std::vector<ScalarType> stl_w(m * k);
-    std::vector<ScalarType> stl_h(k * n);
-=======
 
 void fill_random(std::vector< std::vector<ScalarType> >& v)
 {
@@ -90,37 +61,11 @@ void test_nmf(std::size_t m, std::size_t k, std::size_t n)
 {
     std::vector< std::vector<ScalarType> > stl_w(m, std::vector<ScalarType>(k));
     std::vector< std::vector<ScalarType> > stl_h(k, std::vector<ScalarType>(n));
->>>>>>> upstream/1.5.1
 
     viennacl::matrix<ScalarType> v_ref(m, n);
     viennacl::matrix<ScalarType> w_ref(m, k);
     viennacl::matrix<ScalarType> h_ref(k, n);
 
-<<<<<<< HEAD
-    viennacl::matrix<ScalarType> v_nmf(m, n);
-    viennacl::matrix<ScalarType> w_nmf(m, k);
-    viennacl::matrix<ScalarType> h_nmf(k, n);
-
-    fill_random(stl_w);
-    fill_random(stl_h);
-
-    viennacl::fast_copy(&stl_w[0], &stl_w[0] + stl_w.size(), w_ref);
-    viennacl::fast_copy(&stl_h[0], &stl_h[0] + stl_h.size(), h_ref);
-
-    v_ref = viennacl::linalg::prod(w_ref, h_ref);
-
-    viennacl::ocl::get_queue().finish();
-
-    //Timer timer;
-    //timer.start();
-
-    viennacl::linalg::nmf(v_ref, w_nmf, h_nmf, k);
-    viennacl::ocl::get_queue().finish();
-
-    //double time_spent = timer.get();
-
-    v_nmf = viennacl::linalg::prod(w_nmf, h_nmf);
-=======
     fill_random(stl_w);
     fill_random(stl_h);
 
@@ -145,35 +90,19 @@ void test_nmf(std::size_t m, std::size_t k, std::size_t n)
     viennacl::linalg::nmf(v_ref, w_nmf, h_nmf, conf);
 
     viennacl::matrix<ScalarType> v_nmf = viennacl::linalg::prod(w_nmf, h_nmf);
->>>>>>> upstream/1.5.1
 
     float diff  = matrix_compare(v_ref, v_nmf);
     bool diff_ok = fabs(diff) < EPS;
 
-<<<<<<< HEAD
-    printf("%6s [%lux%lux%lu] diff = %.6f\n", diff_ok?"[[OK]]":"[FAIL]", m, k, n, diff);
-=======
     long iterations = static_cast<long>(conf.iters());
     printf("%6s [%lux%lux%lu] diff = %.6f (%ld iterations)\n", diff_ok ? "[[OK]]":"[FAIL]", m, k, n, diff, iterations);
 
     if (!diff_ok)
       exit(EXIT_FAILURE);
->>>>>>> upstream/1.5.1
 }
 
 int main()
 {
-<<<<<<< HEAD
-    srand(time(NULL));
-
-    test_nmf(3, 3, 3);
-    test_nmf(3, 2, 3);
-    test_nmf(16, 7, 12);
-    test_nmf(160, 73, 200);
-    test_nmf(1000, 15, 1000);
-
-    return 0;
-=======
   //srand(time(NULL));  //let's use deterministic tests, so keep the default srand() initialization
 
   test_nmf(3, 3, 3);
@@ -188,5 +117,4 @@ int main()
 
 
   return EXIT_SUCCESS;
->>>>>>> upstream/1.5.1
 }
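
The resolved nmf.cpp test accepts a factorization once the largest element-wise deviation between the reference product V = W*H and the reconstruction from the computed factors, divided by the largest entry of the reference, drops below EPS = 0.1 (see matrix_compare above). A minimal host-side sketch of that acceptance criterion, using plain std::vector only; the helper name is illustrative and not part of the test:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    typedef float ScalarType;

    // Largest |res - ref| scaled by the largest entry of res; the test passes while
    // this ratio stays below EPS (0.1 in nmf.cpp).
    ScalarType relative_max_diff(std::vector<ScalarType> const & res,
                                 std::vector<ScalarType> const & ref)
    {
      ScalarType diff = 0;
      ScalarType mx   = 0;
      for (std::size_t i = 0; i < res.size(); ++i)
      {
        diff = std::max(diff, std::abs(res[i] - ref[i]));
        mx   = std::max(mx, res[i]);
      }
      return diff / mx;
    }
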
diff --git a/tests/src/sparse.cpp b/tests/src/sparse.cpp
index efe956c..4d07000 100644
--- a/tests/src/sparse.cpp
+++ b/tests/src/sparse.cpp
@@ -685,107 +685,10 @@ int test(Epsilon const& epsilon)
 
     if( std::fabs(diff(result, vcl_result)) > epsilon )
     {
-<<<<<<< HEAD
-      std::cout << "Error reading Result file" << std::endl;
-      return EXIT_FAILURE;
-    }
-    std::cout << "done reading result" << std::endl;
-   
-
-   viennacl::vector<NumericT> vcl_rhs(rhs.size());
-   viennacl::vector<NumericT> vcl_result(result.size()); 
-   viennacl::vector<NumericT> vcl_result2(result.size()); 
-   viennacl::compressed_matrix<NumericT> vcl_compressed_matrix(rhs.size(), rhs.size());
-   viennacl::coordinate_matrix<NumericT> vcl_coordinate_matrix(rhs.size(), rhs.size());
-   viennacl::ell_matrix<NumericT> vcl_ell_matrix(rhs.size(), rhs.size());
-   viennacl::hyb_matrix<NumericT> vcl_hyb_matrix(rhs.size(), rhs.size());
-
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(ublas_matrix, vcl_compressed_matrix);
-   copy(ublas_matrix, vcl_coordinate_matrix);
-
-   // --------------------------------------------------------------------------          
-   std::cout << "Testing products: ublas" << std::endl;
-   result     = viennacl::linalg::prod(ublas_matrix, rhs);
-   
-   std::cout << "Testing products: compressed_matrix" << std::endl;
-   vcl_result = viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs);
-   
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product with compressed_matrix" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   
-   std::cout << "Copying ell_matrix" << std::endl;
-   copy(ublas_matrix, vcl_ell_matrix);
-   ublas_matrix.clear();
-   copy(vcl_ell_matrix, ublas_matrix);// just to check that it's works
-
-
-   std::cout << "Testing products: ell_matrix" << std::endl;
-   vcl_result.clear();
-   vcl_result = viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs);
-   //viennacl::linalg::prod_impl(vcl_ell_matrix, vcl_rhs, vcl_result);
-   //std::cout << vcl_result << "\n";
-   std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-   std::cout << "First entry of result vector: " << vcl_result[0] << std::endl;
-   
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product with ell_matrix" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   
-   
-   std::cout << "Copying hyb_matrix" << std::endl;
-   copy(ublas_matrix, vcl_hyb_matrix);
-   ublas_matrix.clear();
-   copy(vcl_hyb_matrix, ublas_matrix);// just to check that it's works
-   copy(ublas_matrix, vcl_hyb_matrix);
- 
-   std::cout << "Testing products: hyb_matrix" << std::endl;
-   vcl_result.clear();
-   vcl_result = viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs);
-   //viennacl::linalg::prod_impl(vcl_hyb_matrix, vcl_rhs, vcl_result);
-   //std::cout << vcl_result << "\n";
-   std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-   std::cout << "First entry of result vector: " << vcl_result[0] << std::endl;
-   
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product with hyb_matrix" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   
-   // --------------------------------------------------------------------------            
-   // --------------------------------------------------------------------------            
-   NumericT alpha = static_cast<NumericT>(2.786);
-   NumericT beta = static_cast<NumericT>(1.432);
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(result.begin(), result.end(), vcl_result.begin());
-   copy(result.begin(), result.end(), vcl_result2.begin());
-
-   std::cout << "Testing scaled additions of products and vectors" << std::endl;
-   result     = alpha * viennacl::linalg::prod(ublas_matrix, rhs) + beta * result;
-   vcl_result2 = alpha * viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs) + beta * vcl_result;
-
-   if( fabs(diff(result, vcl_result2)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product (compressed_matrix) with scaled additions" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result2)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-=======
       std::cout << "# Error at operation: matrix-vector product with compressed_compressed_matrix (after copy back)" << std::endl;
       std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
       retval = EXIT_FAILURE;
     }
->>>>>>> upstream/1.5.1
 
   }
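
The sparse.cpp test follows one pattern for every sparse format: assemble the matrix with uBLAS on the host, copy it into the ViennaCL type, run the matrix-vector product on both sides, and compare the results against a per-type epsilon. A hedged sketch of that pattern for compressed_matrix; the helper name and the plain max-deviation check are illustrative stand-ins for the test's own diff() helper:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    #include <boost/numeric/ublas/matrix_sparse.hpp>
    #include <boost/numeric/ublas/vector.hpp>

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/prod.hpp"

    template <typename NumericT>
    NumericT max_abs_deviation(boost::numeric::ublas::compressed_matrix<NumericT> const & ublas_matrix,
                               boost::numeric::ublas::vector<NumericT> const & rhs)
    {
      // Reference result on the host:
      boost::numeric::ublas::vector<NumericT> result = boost::numeric::ublas::prod(ublas_matrix, rhs);

      // Same product on the device:
      viennacl::vector<NumericT>            vcl_rhs(rhs.size());
      viennacl::vector<NumericT>            vcl_result(result.size());
      viennacl::compressed_matrix<NumericT> vcl_matrix(ublas_matrix.size1(), ublas_matrix.size2());

      viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
      viennacl::copy(ublas_matrix, vcl_matrix);
      vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_rhs);

      // Copy back and report the largest absolute deviation:
      std::vector<NumericT> host_result(vcl_result.size());
      viennacl::copy(vcl_result.begin(), vcl_result.end(), host_result.begin());

      NumericT dev = 0;
      for (std::size_t i = 0; i < host_result.size(); ++i)
        dev = std::max(dev, static_cast<NumericT>(std::fabs(host_result[i] - result(i))));
      return dev;
    }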
 
diff --git a/tests/src/svd.cpp b/tests/src/svd.cpp
index 10ac144..81775e6 100644
--- a/tests/src/svd.cpp
+++ b/tests/src/svd.cpp
@@ -1,5 +1,3 @@
-<<<<<<< HEAD
-=======
 /* =========================================================================
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
@@ -17,7 +15,6 @@
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
->>>>>>> upstream/1.5.1
 #include <stdexcept>
 #include <iostream>
 #include <string>
@@ -31,102 +28,6 @@
 
 #include "examples/benchmarks/benchmark-utils.hpp"
 
-<<<<<<< HEAD
-typedef float ScalarType;
-
-const float EPS = 0.001;
-
-void read_matrix_size(std::fstream& f, unsigned int& sz1, unsigned int& sz2) {
-    if(!f.is_open())
-        throw std::invalid_argument("File is not opened");
-
-    f >> sz1 >> sz2;
-}
-
-void read_matrix_body(std::fstream& f, viennacl::matrix<ScalarType>& A) {
-    if(!f.is_open())
-        throw std::invalid_argument("File is not opened");
-
-	boost::numeric::ublas::matrix<float> h_A(A.size1(), A.size2());
-
-    for(unsigned int i = 0; i < h_A.size1(); i++) {
-        for(unsigned int j = 0; j < h_A.size2(); j++) {
-            ScalarType val = 0.0;
-            f >> val;
-            h_A(i, j) = val;
-        }
-    }
-
-	viennacl::copy(h_A, A);
-}
-
-void read_vector_body(std::fstream& f, std::vector<ScalarType>& v) {
-    if(!f.is_open())
-        throw std::invalid_argument("File is not opened");
-
-    for(unsigned int i = 0; i < v.size(); i++)
-    {
-            ScalarType val = 0.0;
-            f >> val;
-            v[i] = val;
-    }
-}
-
-void random_fill(std::vector<ScalarType>& in) {
-    for(unsigned int i = 0; i < in.size(); i++) {
-        in[i] = (float)rand() / RAND_MAX;
-    }
-}
-
-bool check_bidiag(viennacl::matrix<ScalarType>& A) {
-    const float EPS = 0.0001f;
-
-    std::vector<ScalarType> aA(A.size1() * A.size2());
-    viennacl::fast_copy(A, &aA[0]);
-
-    for(unsigned int i = 0; i < A.size1(); i++) {
-        for(unsigned int j = 0; j < A.size2(); j++) {
-            ScalarType val = aA[i * A.size2() + j];
-            if((fabs(val) > EPS) && (i != j) && ((i + 1) != j)) {
-                std::cout << "Failed at " << i << " " << j << " " << val << std::endl;
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-float matrix_compare(viennacl::matrix<ScalarType>& res,
-                     viennacl::matrix<ScalarType>& ref) 
-{
-    std::vector<ScalarType> res_std(res.internal_size());
-    std::vector<ScalarType> ref_std(ref.internal_size());
-
-    viennacl::fast_copy(res, &res_std[0]);
-    viennacl::fast_copy(ref, &ref_std[0]);
-
-    float diff = 0.0;
-    float mx = 0.0;
-
-    for(unsigned int i = 0; i < res_std.size(); i++) {
-        diff = std::max(diff, std::abs(res_std[i] - ref_std[i]));
-        mx = std::max(mx, res_std[i]);
-    }
-
-    return diff / mx;
-}
-
-float sigmas_compare(viennacl::matrix<ScalarType>& res, 
-                        std::vector<ScalarType>& ref) 
-{
-    std::vector<ScalarType> res_std(ref.size());
-
-    for(size_t i = 0; i < ref.size(); i++)
-    {
-        res_std[i] = res(i, i);
-    }
-=======
 
 void read_matrix_size(std::fstream& f, std::size_t & sz1, std::size_t & sz2)
 {
@@ -237,20 +138,13 @@ ScalarType sigmas_compare(viennacl::matrix<ScalarType>& res,
 
     for(std::size_t i = 0; i < ref.size(); i++)
         res_std[i] = res(i, i);
->>>>>>> upstream/1.5.1
 
     std::sort(ref.begin(), ref.end());
     std::sort(res_std.begin(), res_std.end());
 
-<<<<<<< HEAD
-    float diff = 0.0;
-    float mx = 0.0;
-    for(size_t i = 0; i < ref.size(); i++) 
-=======
     ScalarType diff = 0.0;
     ScalarType mx = 0.0;
     for(std::size_t i = 0; i < ref.size(); i++)
->>>>>>> upstream/1.5.1
     {
         diff = std::max(diff, std::abs(res_std[i] - ref[i]));
         mx = std::max(mx, res_std[i]);
@@ -260,95 +154,6 @@ ScalarType sigmas_compare(viennacl::matrix<ScalarType>& res,
 }
 
 
-<<<<<<< HEAD
-void test_svd(const std::string& fn) 
-{
-    unsigned int sz1, sz2;
-
-    //read matrix
-
-    // sz1 = 2048, sz2 = 2048;
-    // std::vector<ScalarType> in(sz1 * sz2);
-    // random_fill(in);
-
-    // read file
-    std::fstream f(fn.c_str(), std::fstream::in);
-    //read size of input matrix
-    read_matrix_size(f, sz1, sz2);
-
-    unsigned int to = std::min(sz1, sz2);
-
-    viennacl::matrix<ScalarType> Ai(sz1, sz2), Aref(sz1, sz2), QL(sz1, sz1), QR(sz2, sz2);
-    read_matrix_body(f, Ai);
-
-    std::vector<ScalarType> sigma_ref(to);
-    read_vector_body(f, sigma_ref);
-
-    f.close();
-
-    // viennacl::fast_copy(&in[0], &in[0] + in.size(), Ai);
-
-    Aref = Ai;
-
-    Timer timer;
-    timer.start();
-
-    viennacl::linalg::svd(Ai, QL, QR);
-
-    viennacl::ocl::get_queue().finish();
-
-    double time_spend = timer.get();
-
-    viennacl::matrix<ScalarType> result1(sz1, sz2), result2(sz1, sz2);
-    result1 = viennacl::linalg::prod(QL, Ai);
-    result2 = viennacl::linalg::prod(result1, trans(QR));
-
-    float sigma_diff = sigmas_compare(Ai, sigma_ref);
-    float prods_diff  = matrix_compare(result2, Aref);
-
-    bool sigma_ok = (fabs(sigma_diff) < EPS) && (fabs(prods_diff) < EPS);
-
-	printf("%6s [%dx%d] %40s sigma_diff = %.6f; prod_diff = %.6f; time = %.6f\n", sigma_ok?"[[OK]]":"[FAIL]", (int)Aref.size1(), (int)Aref.size2(), fn.c_str(), sigma_diff, prods_diff, time_spend);
-}
-
-
-void time_svd(size_t sz1, size_t sz2) 
-{
-
-    std::vector<ScalarType> in(sz1 * sz2);
-    random_fill(in);
-
-    viennacl::matrix<ScalarType> Ai(sz1, sz2), QL(sz1, sz1), QR(sz2, sz2);
-
-    viennacl::fast_copy(&in[0], &in[0] + in.size(), Ai);
-
-
-    Timer timer;
-    timer.start();
-
-    viennacl::linalg::svd(Ai, QL, QR);
-
-    viennacl::ocl::get_queue().finish();
-
-    double time_spend = timer.get();
-
-    printf("[%dx%d] time = %.6f\n", (int)sz1, (int)sz2, time_spend);
-}
-
-int main() 
-{
-
-    test_svd(std::string("../../examples/testdata/svd/qr.example"));
-    test_svd(std::string("../../examples/testdata/svd/wiki.example"));
-    test_svd(std::string("../../examples/testdata/svd/wiki.qr.example"));
-    test_svd(std::string("../../examples/testdata/svd/pysvd.example"));
-    test_svd(std::string("../../examples/testdata/svd/random.example"));
-
-    time_svd(500, 500);
-    time_svd(1000, 1000);
-    time_svd(4096, 512);
-    time_svd(2048, 2048);
-=======
 template <typename ScalarType>
 void test_svd(const std::string & fn, ScalarType EPS)
 {
@@ -438,13 +243,10 @@ int test(ScalarType epsilon)
     time_svd<ScalarType>(1000, 1000);
     time_svd<ScalarType>(4096, 512);
     time_svd<ScalarType>(2048, 2048);
->>>>>>> upstream/1.5.1
     //time_svd(4096, 4096);  //takes too long for a standard sanity test. Feel free to uncomment
 
     return EXIT_SUCCESS;
 }
-<<<<<<< HEAD
-=======
 
 //
 // -------------------------------------------------------------
@@ -507,4 +309,3 @@ int main()
 }
 
 
->>>>>>> upstream/1.5.1
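
Both variants of the SVD test verify the same two properties: the diagonal of the overwritten input matrix must match the reference singular values (sigmas_compare), and QL * A * QR^T, with A taken after svd() has overwritten it, must reproduce the original matrix (matrix_compare). A hedged sketch of the reconstruction step, restricted to calls that appear in the hunk above; the function name is illustrative:

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/prod.hpp"
    #include "viennacl/linalg/svd.hpp"

    typedef float ScalarType;

    // svd() overwrites Ai so that its diagonal holds the singular values and fills the
    // factors QL (size1 x size1) and QR (size2 x size2). Multiplying the three factors
    // back together should give the matrix that Ai contained on entry.
    viennacl::matrix<ScalarType> reconstruct_from_svd(viennacl::matrix<ScalarType> & Ai,
                                                      viennacl::matrix<ScalarType> & QL,
                                                      viennacl::matrix<ScalarType> & QR)
    {
      viennacl::linalg::svd(Ai, QL, QR);

      viennacl::matrix<ScalarType> tmp(Ai.size1(), Ai.size2());
      tmp = viennacl::linalg::prod(QL, Ai);

      viennacl::matrix<ScalarType> result(Ai.size1(), Ai.size2());
      result = viennacl::linalg::prod(tmp, trans(QR));
      return result;
    }
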
diff --git a/viennacl/ell_matrix.hpp b/viennacl/ell_matrix.hpp
index 7ea705f..6e8af98 100644
--- a/viennacl/ell_matrix.hpp
+++ b/viennacl/ell_matrix.hpp
@@ -2,41 +2,25 @@
 #define VIENNACL_ELL_MATRIX_HPP_
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-<<<<<<< HEAD
-/** @file ell_matrix.hpp
-    @brief Implementation of the ell_matrix class
-    
-=======
 /** @file viennacl/ell_matrix.hpp
     @brief Implementation of the ell_matrix class
 
->>>>>>> upstream/1.5.1
     Contributed by Volodymyr Kysenko.
 */
 
@@ -45,55 +29,6 @@
 #include "viennacl/vector.hpp"
 
 #include "viennacl/tools/tools.hpp"
-<<<<<<< HEAD
-#include "viennacl/ocl/backend.hpp"
-
-#include "viennacl/linalg/kernels/ell_matrix_kernels.h"
-
-namespace viennacl
-{
-    template<typename SCALARTYPE, unsigned int ALIGNMENT /* see forwards.h for default argument */>
-    class ell_matrix
-    {
-
-      public:
-        ell_matrix() 
-        {
-          viennacl::linalg::kernels::ell_matrix<SCALARTYPE, ALIGNMENT>::init();
-        }
-        
-        ell_matrix(std::size_t row_num, std::size_t col_num) 
-        {
-          viennacl::linalg::kernels::ell_matrix<SCALARTYPE, ALIGNMENT>::init();
-        }
-    
-      public:
-        std::size_t internal_size1() const { return viennacl::tools::roundUpToNextMultiple<std::size_t>(rows_, ALIGNMENT); }
-        std::size_t internal_size2() const { return viennacl::tools::roundUpToNextMultiple<std::size_t>(cols_, ALIGNMENT); }
-
-        std::size_t size1() const { return rows_; }
-        std::size_t size2() const { return cols_; }
-        
-        std::size_t internal_maxnnz() const {return viennacl::tools::roundUpToNextMultiple<std::size_t>(maxnnz_, ALIGNMENT); }
-        std::size_t maxnnz() const { return maxnnz_; }
-
-        std::size_t nnz() const { return rows_ * maxnnz_; }
-        std::size_t internal_nnz() const { return internal_size1() * internal_maxnnz(); }
-
-        const viennacl::ocl::handle<cl_mem>& handle1( ) const { return elements_; } 
-        const viennacl::ocl::handle<cl_mem>& handle2() const { return coords_; }
-
-        template <typename CPU_MATRIX, typename T, unsigned int ALIGN>
-        friend void copy(const CPU_MATRIX & cpu_matrix, ell_matrix<T, ALIGN> & gpu_matrix );
-
-      private:
-        std::size_t rows_;
-        std::size_t cols_;
-        std::size_t maxnnz_;
-
-        viennacl::ocl::handle<cl_mem> coords_;
-        viennacl::ocl::handle<cl_mem> elements_;        
-=======
 
 #include "viennacl/linalg/sparse_matrix_operations.hpp"
 
@@ -172,21 +107,11 @@ namespace viennacl
 
         handle_type coords_;
         handle_type elements_;
->>>>>>> upstream/1.5.1
     };
 
     template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(const CPU_MATRIX& cpu_matrix, ell_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix )
     {
-<<<<<<< HEAD
-      if(cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
-      {
-        //determine max capacity for row
-        std::size_t max_entries_per_row = 0;
-        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
-        {
-          std::size_t num_entries = 0;
-=======
       assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
       assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
 
@@ -197,7 +122,6 @@ namespace viennacl
         for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
         {
           vcl_size_t num_entries = 0;
->>>>>>> upstream/1.5.1
           for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
           {
               ++num_entries;
@@ -211,86 +135,35 @@ namespace viennacl
         gpu_matrix.rows_ = cpu_matrix.size1();
         gpu_matrix.cols_ = cpu_matrix.size2();
 
-<<<<<<< HEAD
-        std::size_t nnz = gpu_matrix.internal_nnz();
-
-        std::vector<cl_uint> coords(nnz, 0);
-        std::vector<SCALARTYPE> elements(nnz, 0);
-
-        // std::cout << "ELL_MATRIX copy " << gpu_matrix.maxnnz_ << " " << gpu_matrix.rows_ << " " << gpu_matrix.cols_ << " " 
-=======
         vcl_size_t nnz = gpu_matrix.internal_nnz();
 
         viennacl::backend::typesafe_host_array<unsigned int> coords(gpu_matrix.handle2(), nnz);
         std::vector<SCALARTYPE> elements(nnz, 0);
 
         // std::cout << "ELL_MATRIX copy " << gpu_matrix.maxnnz_ << " " << gpu_matrix.rows_ << " " << gpu_matrix.cols_ << " "
->>>>>>> upstream/1.5.1
         //             << gpu_matrix.internal_maxnnz() << "\n";
 
         for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
         {
-<<<<<<< HEAD
-          std::size_t data_index = 0;
-          
-          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
-          {
-            coords[gpu_matrix.internal_size1() * data_index + col_it.index1()]   = col_it.index2();
-=======
           vcl_size_t data_index = 0;
 
           for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
           {
             coords.set(gpu_matrix.internal_size1() * data_index + col_it.index1(), col_it.index2());
->>>>>>> upstream/1.5.1
             elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;
             //std::cout << *col_it << "\n";
               data_index++;
           }
         }
 
-<<<<<<< HEAD
-
-        gpu_matrix.coords_   = viennacl::ocl::current_context().create_memory(CL_MEM_READ_ONLY, coords);
-        gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_ONLY, elements);
-=======
         viennacl::backend::memory_create(gpu_matrix.handle2(), coords.raw_size(),                   traits::context(gpu_matrix.handle2()), coords.get());
         viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(SCALARTYPE) * elements.size(), traits::context(gpu_matrix.handle()), &(elements[0]));
->>>>>>> upstream/1.5.1
       }
     }
 
     template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(const ell_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix, CPU_MATRIX& cpu_matrix)
     {
-<<<<<<< HEAD
-      if(gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
-      {
-        cpu_matrix.resize(gpu_matrix.size1(), gpu_matrix.size2());
-
-        std::vector<SCALARTYPE> elements(gpu_matrix.internal_nnz());
-        std::vector<cl_uint> coords(gpu_matrix.internal_nnz());
-
-        cl_int err;
-
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle1(), CL_TRUE, 0, sizeof(SCALARTYPE) * elements.size(), &(elements[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle2(), CL_TRUE, 0, sizeof(cl_uint) * coords.size(), &(coords[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-
-        viennacl::ocl::get_queue().finish();
-
-        for(std::size_t row = 0; row < gpu_matrix.size1(); row++)
-        {
-          for(std::size_t ind = 0; ind < gpu_matrix.internal_maxnnz(); ind++)
-          {
-            std::size_t offset = gpu_matrix.internal_size1() * ind + row;
-            
-            if(elements[offset] == static_cast<SCALARTYPE>(0.0))
-            {
-                continue;
-            }
-=======
       assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
       assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
 
@@ -310,7 +183,6 @@ namespace viennacl
 
             if(elements[offset] == static_cast<SCALARTYPE>(0.0))
                 continue;
->>>>>>> upstream/1.5.1
 
             if(coords[offset] >= gpu_matrix.size2())
             {
@@ -324,111 +196,6 @@ namespace viennacl
       }
     }
 
-<<<<<<< HEAD
-    namespace linalg
-    {
-      /** @brief Returns a proxy class that represents matrix-vector multiplication with a hyb_matrix
-      *
-      * This is used for the convenience expression result = prod(mat, vec);
-      *
-      * @param mat    The matrix
-      * @param vec    The vector
-      */
-      template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-      vector_expression<const ell_matrix<SCALARTYPE, ALIGNMENT>,
-                        const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                        op_prod > prod_impl(const ell_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                            const vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-      {
-        return vector_expression<const ell_matrix<SCALARTYPE, ALIGNMENT>,
-                                 const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                 op_prod >(mat, vec);
-      }
-      
-      template<class TYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-      void prod_impl(
-                      const viennacl::ell_matrix<TYPE, ALIGNMENT>& mat, 
-                      const viennacl::vector<TYPE, VECTOR_ALIGNMENT>& vec,
-                      viennacl::vector<TYPE, VECTOR_ALIGNMENT>& result)
-      {
-        assert(mat.size1() == result.size());
-        assert(mat.size2() == vec.size());
-
-        result.clear();
-
-        std::stringstream ss;
-        ss << "vec_mul_" << 1;//(ALIGNMENT != 1?4:1);
-        viennacl::ocl::kernel& k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::ell_matrix<TYPE, ALIGNMENT>::program_name(), "vec_mul");
-
-        unsigned int thread_num = 128;
-        unsigned int group_num = 256;
-
-        k.local_work_size(0, thread_num);
-        k.global_work_size(0, thread_num * group_num);
-
-        viennacl::ocl::enqueue(k(mat.handle2(), 
-                                 mat.handle1(),
-                                 vec,
-                                 result,
-                                 cl_uint(mat.size1()),
-                                 cl_uint(mat.size2()),
-                                 cl_uint(mat.internal_size1()),
-                                 cl_uint(mat.maxnnz()),
-                                 cl_uint(mat.internal_maxnnz())
-                                ) 
-        );
-
-
-      }
-    }
-    
-    
-    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const ell_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                          viennacl::op_prod> & proxy) 
-    {
-      // check for the special case x = A * x
-      if (proxy.rhs().handle().get() == this->handle().get())
-      {
-        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-        *this = result;
-        return *this;
-      }
-      else
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      return *this;
-    }
-    
-}
-
-#endif
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-=======
 
     //
     // Specify available operations:
@@ -525,6 +292,5 @@ namespace viennacl
 }
 
 #endif
->>>>>>> upstream/1.5.1
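
Both versions of copy() above use the same ELL storage layout: every row gets exactly max-nonzeros-per-row slots, unused slots stay zero, and slot s of row r is written to position internal_size1() * s + r, so that one slot of all rows lies contiguously in memory (the class additionally rounds the row count up to a multiple of ALIGNMENT). A small host-only illustration of that packing, with hypothetical names and a plain row count instead of the aligned internal size:

    #include <cassert>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // host_rows[r] lists the (column, value) pairs of row r; max_per_row is the widest row.
    template <typename T>
    void pack_ell(std::vector< std::vector< std::pair<std::size_t, T> > > const & host_rows,
                  std::size_t max_per_row,
                  std::vector<unsigned int> & coords,
                  std::vector<T> & elements)
    {
      std::size_t rows = host_rows.size();
      coords.assign(rows * max_per_row, 0);
      elements.assign(rows * max_per_row, T(0));

      for (std::size_t r = 0; r < rows; ++r)
      {
        assert(host_rows[r].size() <= max_per_row);
        for (std::size_t s = 0; s < host_rows[r].size(); ++s)
        {
          // slot s of row r: one slot of all rows is stored contiguously
          coords[rows * s + r]   = static_cast<unsigned int>(host_rows[r][s].first);
          elements[rows * s + r] = host_rows[r][s].second;
        }
      }
    }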
 
 
diff --git a/viennacl/forwards.h b/viennacl/forwards.h
index 8d995c9..d2ba91f 100644
--- a/viennacl/forwards.h
+++ b/viennacl/forwards.h
@@ -24,11 +24,7 @@
 */
 
 /**
-<<<<<<< HEAD
- @mainpage Source Code Documentation for ViennaCL 1.3.0
-=======
  @mainpage Source Code Documentation for ViennaCL 1.5.1
->>>>>>> upstream/1.5.1
 
  This is the source code documentation of ViennaCL. Detailed information about the functions in ViennaCL can be found here.
 
@@ -325,12 +321,6 @@ namespace viennacl
   class hyb_matrix;
 
   template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
-  class ell_matrix;
-
-  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
-  class hyb_matrix;
-  
-  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
   class circulant_matrix;
 
   template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
@@ -350,43 +340,25 @@ namespace viennacl
 
   typedef basic_range<>  range;
 
-<<<<<<< HEAD
-  template <typename SizeType = std::size_t, typename DistanceType = std::ptrdiff_t>
-  class basic_slice;
-  
-=======
   template <typename SizeType = vcl_size_t, typename DistanceType = std::ptrdiff_t>
   class basic_slice;
 
->>>>>>> upstream/1.5.1
   typedef basic_slice<>  slice;
 
   template <typename VectorType>
   class vector_range;
-<<<<<<< HEAD
-  
-  template <typename VectorType>
-  class vector_slice;
-  
-=======
 
   template <typename VectorType>
   class vector_slice;
 
->>>>>>> upstream/1.5.1
   template <typename MatrixType>
   class matrix_range;
 
   template <typename MatrixType>
   class matrix_slice;
-<<<<<<< HEAD
-  
-  
-=======
 
 
   /** @brief Helper struct for checking whether a type is a host scalar type (e.g. float, double) */
->>>>>>> upstream/1.5.1
   template <typename T>
   struct is_cpu_scalar
   {
@@ -693,61 +665,6 @@ namespace viennacl
     vcl_size_t index_norm_inf(viennacl::vector_expression<LHS, RHS, OP> const & vec);
 
     //forward definition of prod_impl functions
-<<<<<<< HEAD
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::matrix<SCALARTYPE, F, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod > prod_impl(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> &, 
-                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
-
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::compressed_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod > prod_impl(const viennacl::compressed_matrix<SCALARTYPE, ALIGNMENT> & , 
-                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
-
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod > prod_impl(const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT> & , 
-                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
-
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::ell_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod > prod_impl(const viennacl::ell_matrix<SCALARTYPE, ALIGNMENT> & , 
-                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
-
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::hyb_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod > prod_impl(const viennacl::hyb_matrix<SCALARTYPE, ALIGNMENT> & , 
-                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
-                                
-    //forward definition of inner_prod_impl function
-    /*template <typename V1, typename V2>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value,
-                                  viennacl::scalar_expression< const V1, 
-                                                               const V2,
-                                                               viennacl::op_inner_prod >
-                                >::type
-    inner_prod_impl(V1 const & vec1,
-                    V2 const & vec2);*/
-    
-#ifndef _MSC_VER
-    template <typename V1, typename V2, typename S3>
-    void inner_prod_impl(V1 const & vec1,
-                         V2 const & vec2,
-                         S3 & result,
-                         typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                       && viennacl::is_vector<V2>::value
-                                                       && viennacl::is_scalar<S3>::value
-                                                     >::type * dummy = 0);
-#endif                                                   
-                    
-      
-=======
 
     template <typename NumericT, typename F>
     void prod_impl(const matrix_base<NumericT, F> & mat,
@@ -782,7 +699,6 @@ namespace viennacl
     }
 
 
->>>>>>> upstream/1.5.1
     /** @brief A tag class representing a lower triangular matrix */
     struct lower_tag
     {
diff --git a/viennacl/generator/forwards.h b/viennacl/generator/forwards.h
index c11ea53..fcf5edb 100644
--- a/viennacl/generator/forwards.h
+++ b/viennacl/generator/forwards.h
@@ -2,77 +2,22 @@
 #define VIENNACL_GENERATOR_FORWARDS_H
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-<<<<<<< HEAD
-/** @file viennacl/generator/forwards.h
- *  @brief Forward declarations of the important structures for the kernel generator
- * 
- *  Generator code contributed by Philippe Tillet
- */
-
-#include <string>
-#include "viennacl/forwards.h"
-
-namespace viennacl 
-{
-  namespace generator
-  {
-
-    template<class T>
-    struct is_temporary;
-
-    template<class LHS, class OP_TYPE, class RHS, bool is_temporary = false>
-    class compound_node;
-
-    template<class T>
-    struct inner_prod_impl_t;
-
-    template< unsigned int ID, typename SCALARTYPE, unsigned int ALIGNMENT = 1>
-    class symbolic_vector;
-
-    template<class REF>
-    class tmp_symbolic_vector;
-
-    template<unsigned int ID,
-             typename SCALARTYPE, class F = viennacl::row_major, unsigned int ALIGNMENT = 1>
-    class symbolic_matrix;
-
-    template<class REF>
-    class tmp_symbolic_matrix;
-
-    template<unsigned int ID,typename SCALARTYPE>
-    class cpu_symbolic_scalar;
-
-    template<unsigned int ID,typename SCALARTYPE>
-    class gpu_symbolic_scalar;
-
-  }
-=======
 
 /** @file viennacl/generator/forwards.h
     @brief Forwards declaration
@@ -193,6 +138,5 @@ namespace viennacl{
 
   }
 
->>>>>>> upstream/1.5.1
 }
 #endif
diff --git a/viennacl/hyb_matrix.hpp b/viennacl/hyb_matrix.hpp
index 2adf074..d04de34 100644
--- a/viennacl/hyb_matrix.hpp
+++ b/viennacl/hyb_matrix.hpp
@@ -2,41 +2,25 @@
 #define VIENNACL_HYB_MATRIX_HPP_
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-<<<<<<< HEAD
-/** @file hyb_matrix.hpp
-    @brief Implementation of the hyb_matrix class
-    
-=======
 /** @file viennacl/hyb_matrix.hpp
     @brief Implementation of the hyb_matrix class
 
->>>>>>> upstream/1.5.1
     Contributed by Volodymyr Kysenko.
 */
 
@@ -44,27 +28,6 @@
 #include "viennacl/vector.hpp"
 
 #include "viennacl/tools/tools.hpp"
-<<<<<<< HEAD
-#include "viennacl/ocl/backend.hpp"
-
-#include "viennacl/linalg/kernels/hyb_matrix_kernels.h"
-
-namespace viennacl
-{
-    template<typename SCALARTYPE, unsigned int ALIGNMENT  /* see forwards.h for default argument */>
-    class hyb_matrix
-    {
-
-      public:
-        hyb_matrix() : csr_threshold_(0.8), rows_(0), cols_(0) 
-        {
-          viennacl::linalg::kernels::hyb_matrix<SCALARTYPE, ALIGNMENT>::init();
-        }
-        
-        hyb_matrix(std::size_t row_num, std::size_t col_num) : csr_threshold_(0.8), rows_(row_num), cols_(col_num)
-        {
-          viennacl::linalg::kernels::hyb_matrix<SCALARTYPE, ALIGNMENT>::init();
-=======
 
 #include "viennacl/linalg/sparse_matrix_operations.hpp"
 
@@ -100,47 +63,11 @@ namespace viennacl
             csr_elements_.opencl_handle().context(ctx.opencl_context());
           }
 #endif
->>>>>>> upstream/1.5.1
         }
 
         SCALARTYPE  csr_threshold()  const { return csr_threshold_; }
         void csr_threshold(SCALARTYPE thr) { csr_threshold_ = thr; }
 
-<<<<<<< HEAD
-        std::size_t internal_size1() const { return viennacl::tools::roundUpToNextMultiple<std::size_t>(rows_, ALIGNMENT); }
-        std::size_t internal_size2() const { return viennacl::tools::roundUpToNextMultiple<std::size_t>(cols_, ALIGNMENT); }
-
-        std::size_t size1() const { return rows_; }
-        std::size_t size2() const { return cols_; }
-
-        std::size_t internal_ellnnz() const {return viennacl::tools::roundUpToNextMultiple<std::size_t>(ellnnz_, ALIGNMENT); }
-        std::size_t ell_nnz() const { return ellnnz_; }
-        std::size_t csr_nnz() const { return csrnnz_; }
-
-        const viennacl::ocl::handle<cl_mem>& handle1() const { return ell_elements_; } 
-        const viennacl::ocl::handle<cl_mem>& handle2() const { return ell_coords_; }
-        const viennacl::ocl::handle<cl_mem>& handle3() const { return csr_rows_; } 
-        const viennacl::ocl::handle<cl_mem>& handle4() const { return csr_cols_; } 
-        const viennacl::ocl::handle<cl_mem>& handle5() const { return csr_elements_; }  
-    
-      public:    
-        template <typename CPU_MATRIX, typename T, unsigned int ALIGN>
-        friend void copy(const CPU_MATRIX & cpu_matrix, hyb_matrix<T, ALIGN> & gpu_matrix );
-
-      private:
-        SCALARTYPE  csr_threshold_;
-        std::size_t rows_;
-        std::size_t cols_;
-        std::size_t ellnnz_;
-        std::size_t csrnnz_;
-
-        viennacl::ocl::handle<cl_mem> ell_coords_; // ell coords
-        viennacl::ocl::handle<cl_mem> ell_elements_; // ell elements
-        
-        viennacl::ocl::handle<cl_mem> csr_rows_;
-        viennacl::ocl::handle<cl_mem> csr_cols_;
-        viennacl::ocl::handle<cl_mem> csr_elements_;
-=======
         vcl_size_t internal_size1() const { return viennacl::tools::align_to_multiple<vcl_size_t>(rows_, ALIGNMENT); }
         vcl_size_t internal_size2() const { return viennacl::tools::align_to_multiple<vcl_size_t>(cols_, ALIGNMENT); }
 
@@ -179,23 +106,11 @@ namespace viennacl
         handle_type csr_rows_;
         handle_type csr_cols_;
         handle_type csr_elements_;
->>>>>>> upstream/1.5.1
     };
 
     template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(const CPU_MATRIX& cpu_matrix, hyb_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix )
     {
-<<<<<<< HEAD
-      if(cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
-      {
-        //determine max capacity for row
-        std::size_t max_entries_per_row = 0;
-        std::vector<std::size_t> hist_entries(cpu_matrix.size1(), 0);
-
-        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
-        {
-            std::size_t num_entries = 0;
-=======
       assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
       assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
 
@@ -208,7 +123,6 @@ namespace viennacl
         for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
         {
             vcl_size_t num_entries = 0;
->>>>>>> upstream/1.5.1
             for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
             {
                 ++num_entries;
@@ -217,21 +131,12 @@ namespace viennacl
             hist_entries[num_entries] += 1;
             max_entries_per_row = std::max(max_entries_per_row, num_entries);
         }
-<<<<<<< HEAD
-        
-        std::size_t sum = 0;
-        for(std::size_t ind = 0; ind <= max_entries_per_row; ind++)
-        {
-            sum += hist_entries[ind];
-            
-=======
 
         vcl_size_t sum = 0;
         for(vcl_size_t ind = 0; ind <= max_entries_per_row; ind++)
         {
             sum += hist_entries[ind];
 
->>>>>>> upstream/1.5.1
             if(sum >= gpu_matrix.csr_threshold() * cpu_matrix.size1())
             {
                 max_entries_per_row = ind;
@@ -244,25 +149,6 @@ namespace viennacl
         gpu_matrix.rows_ = cpu_matrix.size1();
         gpu_matrix.cols_ = cpu_matrix.size2();
 
-<<<<<<< HEAD
-        std::size_t nnz = gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz();
-
-        std::vector<cl_uint> ell_coords(nnz, 0);
-        std::vector<cl_uint> csr_rows(cpu_matrix.size1() + 1, 0);
-        std::vector<cl_uint> csr_cols;
-
-        std::vector<SCALARTYPE> ell_elements(nnz, 0.0f);
-        std::vector<SCALARTYPE> csr_elements;
-
-        std::size_t csr_index = 0;
-
-        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
-        {
-          std::size_t data_index = 0;
-  
-          csr_rows[row_it.index1()] = csr_index;
-          
-=======
         vcl_size_t nnz = gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz();
 
         viennacl::backend::typesafe_host_array<unsigned int>  ell_coords(gpu_matrix.ell_coords_, nnz);
@@ -280,26 +166,16 @@ namespace viennacl
 
           csr_rows.set(row_it.index1(), csr_index);
 
->>>>>>> upstream/1.5.1
           for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
           {
             if(data_index < max_entries_per_row)
             {
-<<<<<<< HEAD
-                ell_coords[gpu_matrix.internal_size1() * data_index + col_it.index1()]   = col_it.index2();
-                ell_elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;                        
-            }
-            else
-            {
-                csr_cols.push_back(col_it.index2());
-=======
                 ell_coords.set(gpu_matrix.internal_size1() * data_index + col_it.index1(), col_it.index2());
                 ell_elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;
             }
             else
             {
                 csr_cols.push_back(static_cast<unsigned int>(col_it.index2()));
->>>>>>> upstream/1.5.1
                 csr_elements.push_back(*col_it);
 
                 csr_index++;
@@ -316,19 +192,6 @@ namespace viennacl
           csr_elements.push_back(0);
         }
 
-<<<<<<< HEAD
-        csr_rows[csr_rows.size() - 1] = csr_index;
-
-        gpu_matrix.csrnnz_ = csr_cols.size();
-
-        gpu_matrix.ell_coords_   = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, ell_coords);
-        gpu_matrix.ell_elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, ell_elements);
-
-        gpu_matrix.csr_rows_   = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, csr_rows);
-        gpu_matrix.csr_cols_   = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, csr_cols);
-        gpu_matrix.csr_elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, csr_elements);
-
-=======
         csr_rows.set(csr_rows.size() - 1, csr_index);
 
         gpu_matrix.csrnnz_ = csr_cols.size();
@@ -343,47 +206,12 @@ namespace viennacl
         viennacl::backend::memory_create(gpu_matrix.csr_rows_,     csr_rows.raw_size(),                      traits::context(gpu_matrix.csr_rows_), csr_rows.get());
         viennacl::backend::memory_create(gpu_matrix.csr_cols_,     csr_cols_for_gpu.raw_size(),              traits::context(gpu_matrix.csr_cols_), csr_cols_for_gpu.get());
         viennacl::backend::memory_create(gpu_matrix.csr_elements_, sizeof(SCALARTYPE) * csr_elements.size(), traits::context(gpu_matrix.csr_elements_), &(csr_elements[0]));
->>>>>>> upstream/1.5.1
       }
     }
 
     template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(const hyb_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix, CPU_MATRIX& cpu_matrix)
     {
-<<<<<<< HEAD
-      if(gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
-      {
-        cpu_matrix.resize(gpu_matrix.size1(), gpu_matrix.size2());
-
-        std::vector<SCALARTYPE> ell_elements(gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
-        std::vector<cl_uint> ell_coords(gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
-
-        std::vector<SCALARTYPE> csr_elements(gpu_matrix.csr_nnz());
-        std::vector<cl_uint> csr_rows(gpu_matrix.size1() + 1);
-        std::vector<cl_uint> csr_cols(gpu_matrix.csr_nnz());
-
-        cl_int err;
-
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle1(), CL_TRUE, 0, sizeof(SCALARTYPE) * ell_elements.size(), &(ell_elements[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle2(), CL_TRUE, 0, sizeof(cl_uint) * ell_coords.size(), &(ell_coords[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle3(), CL_TRUE, 0, sizeof(cl_uint) * csr_rows.size(), &(csr_rows[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle4(), CL_TRUE, 0, sizeof(cl_uint) * csr_cols.size(), &(csr_cols[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle5(), CL_TRUE, 0, sizeof(SCALARTYPE) * csr_elements.size(), &(csr_elements[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-
-        viennacl::ocl::get_queue().finish();
-
-        for(std::size_t row = 0; row < gpu_matrix.size1(); row++)
-        {
-          for(std::size_t ind = 0; ind < gpu_matrix.internal_ellnnz(); ind++)
-          {
-            std::size_t offset = gpu_matrix.internal_size1() * ind + row;
-            
-=======
       assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
       assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
 
@@ -409,7 +237,6 @@ namespace viennacl
           {
             vcl_size_t offset = gpu_matrix.internal_size1() * ind + row;
 
->>>>>>> upstream/1.5.1
             if(ell_elements[offset] == static_cast<SCALARTYPE>(0.0))
             {
               continue;
@@ -424,11 +251,7 @@ namespace viennacl
             cpu_matrix(row, ell_coords[offset]) = ell_elements[offset];
           }
 
-<<<<<<< HEAD
-          for(std::size_t ind = csr_rows[row]; ind < csr_rows[row+1]; ind++)
-=======
           for(vcl_size_t ind = csr_rows[row]; ind < csr_rows[row+1]; ind++)
->>>>>>> upstream/1.5.1
           {
             if(csr_elements[ind] == static_cast<SCALARTYPE>(0.0))
             {
@@ -448,93 +271,6 @@ namespace viennacl
     }
 
 
-<<<<<<< HEAD
-    namespace linalg
-    {
-      
-      /** @brief Returns a proxy class that represents matrix-vector multiplication with a hyb_matrix
-      *
-      * This is used for the convenience expression result = prod(mat, vec);
-      *
-      * @param mat    The matrix
-      * @param vec    The vector
-      */
-      template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-      vector_expression<const hyb_matrix<SCALARTYPE, ALIGNMENT>,
-                        const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                        op_prod > prod_impl(const hyb_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                      const vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-      {
-        return vector_expression<const hyb_matrix<SCALARTYPE, ALIGNMENT>,
-                                const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod >(mat, vec);
-      }
-      
-      template<class TYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-      void prod_impl( const viennacl::hyb_matrix<TYPE, ALIGNMENT>& mat, 
-                      const viennacl::vector<TYPE, VECTOR_ALIGNMENT>& vec,
-                      viennacl::vector<TYPE, VECTOR_ALIGNMENT>& result)
-      {
-        assert(mat.size1() == result.size());
-        assert(mat.size2() == vec.size());
-
-        result.clear();
-
-        viennacl::ocl::kernel& k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::hyb_matrix<TYPE, ALIGNMENT>::program_name(), "vec_mul");
-
-        unsigned int thread_num = 256;
-        unsigned int group_num = 32;
-
-        k.local_work_size(0, thread_num);
-        k.global_work_size(0, thread_num * group_num);
-
-        viennacl::ocl::enqueue(k(mat.handle2(), 
-                                mat.handle1(),
-                                mat.handle3(),
-                                mat.handle4(),
-                                mat.handle5(),
-                                vec,
-                                result,
-                                cl_uint(mat.size1()),
-                                cl_uint(mat.internal_size1()),
-                                cl_uint(mat.ell_nnz()),
-                                cl_uint(mat.internal_ellnnz())
-                                ) 
-        );
-      }
-    }
-
-    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const hyb_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                          viennacl::op_prod> & proxy) 
-    {
-      // check for the special case x = A * x
-      if (proxy.rhs().handle().get() == this->handle().get())
-      {
-        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-        *this = result;
-        return *this;
-      }
-      else
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      return *this;
-    }
-
-}
-
-#endif
-=======
     //
     // Specify available operations:
     //
@@ -630,4 +366,3 @@ namespace viennacl
 }
 
 #endif
->>>>>>> upstream/1.5.1
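
The hyb_matrix copy() above splits each matrix into an ELL part and a CSR part: it builds a histogram of row lengths, chooses the ELL width as the smallest value that lets at least csr_threshold() (0.8 by default) of all rows fit completely, and pushes the remaining entries of longer rows into the CSR arrays. A host-only sketch of that width selection, with an illustrative function name:

    #include <cstddef>
    #include <vector>

    // hist_entries[k] = number of rows having exactly k nonzeros.
    inline std::size_t choose_ell_width(std::vector<std::size_t> const & hist_entries,
                                        std::size_t num_rows,
                                        double csr_threshold)
    {
      std::size_t sum = 0;
      for (std::size_t width = 0; width < hist_entries.size(); ++width)
      {
        sum += hist_entries[width];
        // Stop as soon as csr_threshold of the rows fit entirely into width ELL slots;
        // everything beyond this width later goes into the CSR part.
        if (sum >= csr_threshold * static_cast<double>(num_rows))
          return width;
      }
      return hist_entries.empty() ? 0 : hist_entries.size() - 1;
    }
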
diff --git a/viennacl/linalg/bicgstab.hpp b/viennacl/linalg/bicgstab.hpp
index 38dedbd..642b490 100644
--- a/viennacl/linalg/bicgstab.hpp
+++ b/viennacl/linalg/bicgstab.hpp
@@ -108,15 +108,6 @@ namespace viennacl
       CPU_ScalarType beta;
       CPU_ScalarType alpha;
       CPU_ScalarType omega;
-<<<<<<< HEAD
-      ScalarType inner_prod_temp; //temporary variable for inner product computation
-      ScalarType new_ip_rr0star = 0;
-      
-      if (norm_rhs_host == 0) //solution is zero if RHS norm is zero
-        return result;
-      
-      for (unsigned int i = 0; i < tag.max_iterations(); ++i)
-=======
       //ScalarType inner_prod_temp; //temporary variable for inner product computation
       CPU_ScalarType new_ip_rr0star = 0;
       CPU_ScalarType residual_norm = norm_rhs_host;
@@ -127,7 +118,6 @@ namespace viennacl
       bool restart_flag = true;
       vcl_size_t last_restart = 0;
       for (vcl_size_t i = 0; i < tag.max_iterations(); ++i)
->>>>>>> upstream/1.5.1
       {
         if (restart_flag)
         {
@@ -215,14 +205,6 @@ namespace viennacl
       CPU_ScalarType beta;
       CPU_ScalarType alpha;
       CPU_ScalarType omega;
-<<<<<<< HEAD
-      ScalarType new_ip_rr0star = 0;
-      ScalarType inner_prod_temp; //temporary variable for inner product
-      
-      if (norm_rhs_host == 0) //solution is zero if RHS norm is zero
-        return result;
-      
-=======
       CPU_ScalarType new_ip_rr0star = 0;
       CPU_ScalarType residual_norm = norm_rhs_host;
 
@@ -231,7 +213,6 @@ namespace viennacl
 
       bool restart_flag = true;
       vcl_size_t last_restart = 0;
->>>>>>> upstream/1.5.1
       for (unsigned int i = 0; i < tag.max_iterations(); ++i)
       {
         if (restart_flag)
diff --git a/viennacl/linalg/bisect.hpp b/viennacl/linalg/bisect.hpp
index 5cb9006..3c04917 100644
--- a/viennacl/linalg/bisect.hpp
+++ b/viennacl/linalg/bisect.hpp
@@ -2,27 +2,17 @@
 #define VIENNACL_LINALG_BISECT_HPP_
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -30,11 +20,7 @@
 
 /** @file viennacl/linalg/bisect.hpp
 *   @brief Implementation of the algorithm for finding eigenvalues of a tridiagonal matrix.
-<<<<<<< HEAD
-* 
-=======
 *
->>>>>>> upstream/1.5.1
 *   Contributed by Guenther Mader and Astrid Rupp.
 */
 
@@ -46,15 +32,6 @@
 
 namespace viennacl
 {
-<<<<<<< HEAD
-  namespace linalg 
-  {
-    
-    namespace detail
-    {
-      /** 
-      *    @brief overloaded function for copying vectors 
-=======
   namespace linalg
   {
 
@@ -62,41 +39,22 @@ namespace viennacl
     {
       /**
       *    @brief overloaded function for copying vectors
->>>>>>> upstream/1.5.1
       */
       template <typename T, typename OtherVectorType>
       void copy_vec_to_vec(viennacl::vector<T> const & src, OtherVectorType & dest)
       {
-<<<<<<< HEAD
-        viennacl::copy(src, dest); 
-=======
         viennacl::copy(src, dest);
->>>>>>> upstream/1.5.1
       }
 
       template <typename OtherVectorType, typename T>
       void copy_vec_to_vec(OtherVectorType const & src, viennacl::vector<T> & dest)
       {
-<<<<<<< HEAD
-        viennacl::copy(src, dest); 
-=======
         viennacl::copy(src, dest);
->>>>>>> upstream/1.5.1
       }
 
       template <typename VectorType1, typename VectorType2>
       void copy_vec_to_vec(VectorType1 const & src, VectorType2 & dest)
       {
-<<<<<<< HEAD
-        for (std::size_t i=0; i<src.size(); ++i)
-          dest[i] = src[i]; 
-      }
-    }
-    
-    /** 
-    *   @brief Implementation of the bisect-algorithm for the calculation of the eigenvalues of a tridiagonal matrix. Experimental - interface might change.
-    *   
-=======
         for (vcl_size_t i=0; i<src.size(); ++i)
           dest[i] = src[i];
       }
@@ -105,7 +63,6 @@ namespace viennacl
     /**
     *   @brief Implementation of the bisect-algorithm for the calculation of the eigenvalues of a tridiagonal matrix. Experimental - interface might change.
     *
->>>>>>> upstream/1.5.1
     *   @param alphas       Elements of the main diagonal
     *   @param betas        Elements of the secondary diagonal
     *   @return             Returns the eigenvalues of the tridiagonal matrix defined by alpha and beta
@@ -113,15 +70,6 @@ namespace viennacl
     template< typename VectorT >
     std::vector<
             typename viennacl::result_of::cpu_value_type<typename VectorT::value_type>::type
-<<<<<<< HEAD
-            > 
-    bisect(VectorT const & alphas, VectorT const & betas)
-    {
-      typedef typename viennacl::result_of::value_type<VectorT>::type           ScalarType;
-      typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;  
-
-      std::size_t size = betas.size();
-=======
             >
     bisect(VectorT const & alphas, VectorT const & betas)
     {
@@ -129,7 +77,6 @@ namespace viennacl
       typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
 
       vcl_size_t size = betas.size();
->>>>>>> upstream/1.5.1
       std::vector<CPU_ScalarType>  x_temp(size);
 
 
@@ -138,19 +85,6 @@ namespace viennacl
 
       double rel_error = std::numeric_limits<CPU_ScalarType>::epsilon();
       beta_bisect.push_back(0);
-<<<<<<< HEAD
-    
-      for(std::size_t i = 1; i < size; i++){
-              beta_bisect.push_back(betas[i] * betas[i]);
-      }
-
-      double xmin = alphas[size - 1] - std::abs<CPU_ScalarType>(betas[size - 1]);
-      double xmax = alphas[size - 1] + std::abs<CPU_ScalarType>(betas[size - 1]);
-
-      for(std::size_t i = 0; i < size - 1; i++)
-      {
-        double h = std::abs<CPU_ScalarType>(betas[i]) + std::abs<CPU_ScalarType>(betas[i + 1]);
-=======
 
       for(vcl_size_t i = 1; i < size; i++){
               beta_bisect.push_back(betas[i] * betas[i]);
@@ -162,18 +96,13 @@ namespace viennacl
       for(vcl_size_t i = 0; i < size - 1; i++)
       {
         double h = std::fabs(betas[i]) + std::fabs(betas[i + 1]);
->>>>>>> upstream/1.5.1
         if (alphas[i] + h > xmax)
           xmax = alphas[i] + h;
         if (alphas[i] - h < xmin)
           xmin = alphas[i] - h;
       }
 
-<<<<<<< HEAD
-      
-=======
 
->>>>>>> upstream/1.5.1
       double eps1 = 1e-6;
       /*double eps2 = (xmin + xmax > 0) ? (rel_error * xmax) : (-rel_error * xmin);
       if(eps1 <= 0)
@@ -183,21 +112,13 @@ namespace viennacl
 
       double x0 = xmax;
 
-<<<<<<< HEAD
-      for(std::size_t i = 0; i < size; i++)
-=======
       for(vcl_size_t i = 0; i < size; i++)
->>>>>>> upstream/1.5.1
       {
         x_temp[i] = xmax;
         wu.push_back(xmin);
       }
 
-<<<<<<< HEAD
-      for(long k = size - 1; k >= 0; --k)
-=======
       for(long k = static_cast<long>(size) - 1; k >= 0; --k)
->>>>>>> upstream/1.5.1
       {
         double xu = xmin;
         for(long i = k; i >= 0; --i)
@@ -208,48 +129,27 @@ namespace viennacl
             break;
           }
         }
-<<<<<<< HEAD
-        
-=======
 
->>>>>>> upstream/1.5.1
         if(x0 > x_temp[k])
           x0 = x_temp[k];
 
         double x1 = (xu + x0) / 2.0;
-<<<<<<< HEAD
-        while (x0 - xu > 2.0 * rel_error * (std::abs(xu) + std::abs(x0)) + eps1)
-        {
-          std::size_t a = 0;
-          double q = 1;
-          for(std::size_t i = 0; i < size; i++)
-=======
         while (x0 - xu > 2.0 * rel_error * (std::fabs(xu) + std::fabs(x0)) + eps1)
         {
           vcl_size_t a = 0;
           double q = 1;
           for(vcl_size_t i = 0; i < size; i++)
->>>>>>> upstream/1.5.1
           {
             if(q != 0)
               q = alphas[i] - x1 - beta_bisect[i] / q;
             else
-<<<<<<< HEAD
-              q = alphas[i] - x1 - std::abs(betas[i] / rel_error);
-=======
               q = alphas[i] - x1 - std::fabs(betas[i] / rel_error);
->>>>>>> upstream/1.5.1
 
             if(q < 0)
               a++;
           }
-<<<<<<< HEAD
-          
-          if (a <= static_cast<std::size_t>(k))
-=======
 
           if (a <= static_cast<vcl_size_t>(k))
->>>>>>> upstream/1.5.1
           {
             xu = x1;
             if(a < 1)
@@ -270,14 +170,7 @@ namespace viennacl
       }
       return x_temp;
     }
-<<<<<<< HEAD
-    
-  } // end namespace linalg
-} // end namespace viennacl
-#endif
-=======
 
   } // end namespace linalg
 } // end namespace viennacl
 #endif
->>>>>>> upstream/1.5.1
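
For orientation, a minimal usage sketch of the bisect() interface kept by this file (illustrative only, not part of the patch; the tridiagonal data below is made up):

// Illustrative sketch: eigenvalues of a small symmetric tridiagonal matrix
// via viennacl::linalg::bisect(). Plain std::vector containers are assumed
// to be accepted, as the generic copy_vec_to_vec() overload suggests.
#include <iostream>
#include <vector>
#include "viennacl/linalg/bisect.hpp"

int main()
{
  std::size_t n = 5;
  std::vector<double> alphas(n, 2.0);   // main diagonal
  std::vector<double> betas(n, -1.0);   // secondary diagonal
  betas[0] = 0.0;                       // first entry only enters the initial search interval

  std::vector<double> eigenvalues = viennacl::linalg::bisect(alphas, betas);

  for (std::size_t i = 0; i < eigenvalues.size(); ++i)
    std::cout << eigenvalues[i] << std::endl;
  return 0;
}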
diff --git a/viennacl/linalg/cg.hpp b/viennacl/linalg/cg.hpp
index c68bfb9..e981239 100644
--- a/viennacl/linalg/cg.hpp
+++ b/viennacl/linalg/cg.hpp
@@ -107,15 +107,9 @@ namespace viennacl
       CPU_ScalarType norm_rhs = std::sqrt(ip_rr);
 
       //std::cout << "Starting CG solver iterations... " << std::endl;
-<<<<<<< HEAD
-      if (norm_rhs_squared == 0) //solution is zero if RHS norm is zero
-        return result;
-      
-=======
       if (norm_rhs == 0) //solution is zero if RHS norm is zero
         return result;
 
->>>>>>> upstream/1.5.1
       for (unsigned int i = 0; i < tag.max_iterations(); ++i)
       {
         tag.iters(i+1);
@@ -183,11 +177,7 @@ namespace viennacl
 
       if (norm_rhs_squared == 0) //solution is zero if RHS norm is zero
         return result;
-<<<<<<< HEAD
-      
-=======
 
->>>>>>> upstream/1.5.1
       for (unsigned int i = 0; i < tag.max_iterations(); ++i)
       {
         tag.iters(i+1);
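
For context, a typical call into this CG implementation looks like the following sketch (illustrative only, not part of the patch; A and rhs are placeholders for an assembled symmetric positive definite system):

// Illustrative sketch: unpreconditioned CG solve.
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/linalg/cg.hpp"

viennacl::vector<double> solve_spd(viennacl::compressed_matrix<double> const & A,
                                   viennacl::vector<double> const & rhs)
{
  viennacl::linalg::cg_tag tag(1e-8, 300);   // relative tolerance, max. iterations
  return viennacl::linalg::solve(A, rhs, tag);
}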
diff --git a/viennacl/linalg/detail/ilu/block_ilu.hpp b/viennacl/linalg/detail/ilu/block_ilu.hpp
index 76faa61..406553a 100644
--- a/viennacl/linalg/detail/ilu/block_ilu.hpp
+++ b/viennacl/linalg/detail/ilu/block_ilu.hpp
@@ -2,38 +2,24 @@
 #define VIENNACL_LINALG_DETAIL_BLOCK_ILU_HPP_
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file viennacl/linalg/detail/ilu/block_ilu.hpp
-<<<<<<< HEAD
-    @brief Implementations of incomplete factorization preconditioners
-=======
     @brief Implementations of incomplete block factorization preconditioners
->>>>>>> upstream/1.5.1
 */
 
 #include <vector>
@@ -52,87 +38,6 @@ namespace viennacl
   {
     namespace detail
     {
-<<<<<<< HEAD
-      template <typename VectorType>
-      class ilu_vector_range
-      {
-        public:
-          typedef typename VectorType::value_type      value_type;
-          typedef typename VectorType::size_type       size_type;
-          
-          ilu_vector_range(VectorType & v,
-                           size_type start_index,
-                           size_type vec_size
-                          ) : vec_(v), start_(start_index), size_(vec_size) {}
-          
-          value_type & operator()(size_type index)
-          {
-            assert(index < size_ && "Index out of bounds!");
-            
-            return vec_[start_ + index];  
-          }
-          
-          value_type & operator[](size_type index)
-          {
-            return this->operator()(index);
-          }
-          
-          size_type size() const { return size_; }
-          
-        private:
-          VectorType & vec_;
-          size_type start_;
-          size_type size_;
-      };
-      
-      /** @brief Extracts a diagonal block from a larger system matrix
-        *
-        * @param compressed_matrix   The full matrix
-        * @param block_matrix        The output matrix, to which the extracted block is written to
-        * @param start_index         First row- and column-index of the block
-        * @param stop_index          First row- and column-index beyond the block
-        */
-      template <typename MatrixType, typename STLMatrixType>
-      void extract_block_matrix(MatrixType const & compressed_matrix,
-                                STLMatrixType & block_matrix,
-                                std::size_t start_index,
-                                std::size_t stop_index
-                                )
-      {
-        typedef typename MatrixType::const_iterator1     RowIterator;
-        typedef typename MatrixType::const_iterator2     ColumnIterator;
-
-        for (RowIterator row_iter = compressed_matrix.begin1();
-                        row_iter != compressed_matrix.end1();
-                      ++row_iter)
-        {
-          if (row_iter.index1() < start_index)
-            continue;
-
-          if (row_iter.index1() >= stop_index)
-            break;
-
-          for (ColumnIterator col_iter = row_iter.begin();
-                              col_iter != row_iter.end();
-                            ++col_iter)
-          {
-            if (col_iter.index2() < start_index)
-              continue;
-
-            if (col_iter.index2() >= static_cast<std::size_t>(stop_index))
-              continue;
-
-            block_matrix[col_iter.index1() - start_index][col_iter.index2() - start_index] = *col_iter;
-          }
-        }
-      }
-          
-      
-    }
-
-    /** @brief A block ILU preconditioner class, can be supplied to solve()-routines
-     * 
-=======
       /** @brief Helper range class for representing a subvector of a larger buffer. */
       template <typename VectorType, typename ValueType, typename SizeType = vcl_size_t>
       class ilu_vector_range
@@ -223,7 +128,6 @@ namespace viennacl
 
     /** @brief A block ILU preconditioner class, can be supplied to solve()-routines
      *
->>>>>>> upstream/1.5.1
      * @tparam MatrixType   Type of the system matrix
     * @tparam ILUTag       Type of the tag identifying the ILU preconditioner to be used on each block.
     */
@@ -231,33 +135,6 @@ namespace viennacl
     class block_ilu_precond
     {
       typedef typename MatrixType::value_type      ScalarType;
-<<<<<<< HEAD
-      typedef std::vector< std::map<unsigned int, ScalarType> >   InternalMatrixType;
-      
-      public:
-        typedef std::vector<std::pair<std::size_t, std::size_t> >    index_vector_type;   //the pair refers to index range [a, b) of each block
-        
-        
-        block_ilu_precond(MatrixType const & mat,
-                          ILUTag const & tag,
-                          std::size_t num_blocks = 4
-                         ) : tag_(tag), LU_blocks(num_blocks)
-        {
-          
-          // Set up vector of block indices:
-          block_indices_.resize(num_blocks);
-          for (std::size_t i=0; i<num_blocks; ++i)
-          {
-            std::size_t start_index = (   i  * mat.size1()) / num_blocks;
-            std::size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
-            
-            block_indices_[i] = std::pair<std::size_t, std::size_t>(start_index, stop_index);
-          }
-          
-          //initialize preconditioner:
-          //std::cout << "Start CPU precond" << std::endl;
-          init(mat);          
-=======
 
       public:
         typedef std::vector<std::pair<vcl_size_t, vcl_size_t> >    index_vector_type;   //the pair refers to index range [a, b) of each block
@@ -282,7 +159,6 @@ namespace viennacl
           //initialize preconditioner:
           //std::cout << "Start CPU precond" << std::endl;
           init(mat);
->>>>>>> upstream/1.5.1
           //std::cout << "End CPU precond" << std::endl;
         }
 
@@ -293,55 +169,6 @@ namespace viennacl
         {
           //initialize preconditioner:
           //std::cout << "Start CPU precond" << std::endl;
-<<<<<<< HEAD
-          init(mat);          
-          //std::cout << "End CPU precond" << std::endl;
-        }
-        
-        
-        template <typename VectorType>
-        void apply(VectorType & vec) const
-        {
-          for (std::size_t i=0; i<block_indices_.size(); ++i)
-          {
-            viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU_blocks[i],
-                                                                                      LU_blocks[i].size(),
-                                                                                      LU_blocks[i].size());
-            detail::ilu_vector_range<VectorType>  vec_range(vec,
-                                                            block_indices_[i].first,
-                                                            LU_blocks[i].size());
-            viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, vec_range);
-          }
-        }
-        
-      private:
-        void init(MatrixType const & mat)
-        {
-          
-          for (std::size_t i=0; i<block_indices_.size(); ++i)
-          {
-            // Step 1: Extract blocks
-            std::size_t block_size = block_indices_[i].second - block_indices_[i].first;
-            InternalMatrixType mat_block(block_size);
-            detail::extract_block_matrix(mat, mat_block, block_indices_[i].first, block_indices_[i].second);
-            
-            
-            // Step 2: Precondition blocks:
-            viennacl::tools::const_sparse_matrix_adapter<ScalarType>  mat_block_adapter(mat_block, block_size, block_size);
-            viennacl::tools::sparse_matrix_adapter<ScalarType>        LU_adapter(LU_blocks[i], block_size, block_size);
-            viennacl::linalg::precondition(mat_block_adapter, LU_adapter, tag_);
-          }
-          
-        }
-
-        
-        ILUTag const & tag_;
-        index_vector_type block_indices_;
-        std::vector< InternalMatrixType > LU_blocks;
-    };
-
-    
-=======
           init(mat);
           //std::cout << "End CPU precond" << std::endl;
         }
@@ -421,7 +248,6 @@ namespace viennacl
 
 
 
->>>>>>> upstream/1.5.1
     /** @brief Block ILU preconditioner class, can be supplied to solve()-routines.
     *
     *  Specialization for compressed_matrix
@@ -430,35 +256,6 @@ namespace viennacl
     class block_ilu_precond< compressed_matrix<ScalarType, MAT_ALIGNMENT>, ILUTag >
     {
         typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>        MatrixType;
-<<<<<<< HEAD
-        typedef std::vector< std::map<unsigned int, ScalarType> >   InternalMatrixType;
-        typedef std::vector<ScalarType>                             STLVectorType;
-      
-      public:
-        typedef std::vector<std::pair<std::size_t, std::size_t> >    index_vector_type;   //the pair refers to index range [a, b) of each block
-          
-        
-        
-        block_ilu_precond(MatrixType const & mat,
-                          ILUTag const & tag,
-                          std::size_t num_blocks = 4
-                         ) : tag_(tag), LU_blocks(num_blocks)
-        {
-          
-          // Set up vector of block indices:
-          block_indices_.resize(num_blocks);
-          for (std::size_t i=0; i<num_blocks; ++i)
-          {
-            std::size_t start_index = (   i  * mat.size1()) / num_blocks;
-            std::size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
-            
-            block_indices_[i] = std::pair<std::size_t, std::size_t>(start_index, stop_index);
-          }
-          
-          //initialize preconditioner:
-          //std::cout << "Start CPU precond" << std::endl;
-          init(mat);          
-=======
         //typedef std::vector<ScalarType>                             STLVectorType;
 
       public:
@@ -489,82 +286,12 @@ namespace viennacl
           //initialize preconditioner:
           //std::cout << "Start CPU precond" << std::endl;
           init(mat);
->>>>>>> upstream/1.5.1
           //std::cout << "End CPU precond" << std::endl;
         }
 
         block_ilu_precond(MatrixType const & mat,
                           ILUTag const & tag,
                           index_vector_type const & block_boundaries
-<<<<<<< HEAD
-                         ) : tag_(tag), block_indices_(block_boundaries), LU_blocks(block_boundaries.size())
-        {
-          //initialize preconditioner:
-          //std::cout << "Start CPU precond" << std::endl;
-          init(mat);          
-          //std::cout << "End CPU precond" << std::endl;
-        }
-        
-        
-        void apply(vector<ScalarType> & vec) const
-        {
-          viennacl::copy(vec, temp_vec);
-          
-          for (std::size_t i=0; i<block_indices_.size(); ++i)
-          {
-            viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU_blocks[i],
-                                                                                      LU_blocks[i].size(),
-                                                                                      LU_blocks[i].size());
-            detail::ilu_vector_range<STLVectorType>  vec_range(temp_vec,
-                                                            block_indices_[i].first,
-                                                            LU_blocks[i].size());
-            viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, vec_range);
-          }
-                    
-          viennacl::copy(temp_vec, vec);
-        }
-        
-      private:
-        void init(MatrixType const & mat)
-        {
-          InternalMatrixType temp(mat.size1());
-          //std::vector< std::map<unsigned int, ScalarType> > LU_cpu(mat.size1());
-
-          //copy to cpu:
-          viennacl::copy(mat, temp);
-          
-          for (std::size_t i=0; i<block_indices_.size(); ++i)
-          {
-            // Step 1: Extract blocks
-            std::size_t block_size = block_indices_[i].second - block_indices_[i].first;
-            InternalMatrixType mat_block(block_size);
-            viennacl::tools::const_sparse_matrix_adapter<ScalarType>  temp_adapter(temp, temp.size(), temp.size());
-            detail::extract_block_matrix(temp_adapter, mat_block, block_indices_[i].first, block_indices_[i].second);
-            
-            
-            // Step 2: Precondition blocks:
-            viennacl::tools::const_sparse_matrix_adapter<ScalarType>  mat_block_adapter(mat_block, block_size, block_size);
-            viennacl::tools::sparse_matrix_adapter<ScalarType>        LU_adapter(LU_blocks[i], block_size, block_size);
-            viennacl::linalg::precondition(mat_block_adapter, LU_adapter, tag_);
-          }
-          
-          //viennacl::tools::const_sparse_matrix_adapter<ScalarType>       temp_adapter(temp, temp.size(), temp.size());
-          //viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU, LU.size(), LU.size());
-          //viennacl::linalg::precondition(temp_adapter, LU_adapter, _tag);
-          
-          temp_vec.resize(mat.size1());
-          
-          //copy resulting preconditioner back to gpu:
-          //copy(LU_cpu, LU);
-        }
-        
-        ILUTag const & tag_;
-        index_vector_type block_indices_;
-        std::vector< InternalMatrixType > LU_blocks;
-        mutable STLVectorType temp_vec;
-    };
-
-=======
                          ) : tag_(tag),
                              block_indices_(block_boundaries),
                              gpu_block_indices(viennacl::traits::context(mat)),
@@ -724,7 +451,6 @@ namespace viennacl
     };
 
 
->>>>>>> upstream/1.5.1
   }
 }
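
A usage sketch for the block ILU preconditioner kept by this file (illustrative only, not part of the patch; assumes the usual solve() interface and a placeholder system):

// Illustrative sketch: block ILU0 preconditioner (8 diagonal blocks) combined with CG.
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/ilu.hpp"     // convenience header pulling in block_ilu.hpp
#include "viennacl/linalg/cg.hpp"

viennacl::vector<double> solve_with_block_ilu(viennacl::compressed_matrix<double> const & A,
                                              viennacl::vector<double> const & rhs)
{
  typedef viennacl::compressed_matrix<double> MatrixType;

  viennacl::linalg::block_ilu_precond<MatrixType, viennacl::linalg::ilu0_tag>
      precond(A, viennacl::linalg::ilu0_tag(), 8);   // one ILU0 factor per diagonal block

  return viennacl::linalg::solve(A, rhs, viennacl::linalg::cg_tag(1e-8, 300), precond);
}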
 
diff --git a/viennacl/linalg/detail/ilu/common.hpp b/viennacl/linalg/detail/ilu/common.hpp
index 0665325..e66e362 100644
--- a/viennacl/linalg/detail/ilu/common.hpp
+++ b/viennacl/linalg/detail/ilu/common.hpp
@@ -2,27 +2,17 @@
 #define VIENNACL_LINALG_DETAIL_ILU_COMMON_HPP_
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -34,12 +24,6 @@
 
 #include <vector>
 #include <cmath>
-<<<<<<< HEAD
-#include "viennacl/forwards.h"
-#include "viennacl/tools/tools.hpp"
-
-#include <map>
-=======
 #include <iostream>
 #include <map>
 #include <list>
@@ -50,7 +34,6 @@
 
 #include "viennacl/linalg/host_based/common.hpp"
 #include "viennacl/linalg/misc_operations.hpp"
->>>>>>> upstream/1.5.1
 
 namespace viennacl
 {
@@ -58,109 +41,6 @@ namespace viennacl
   {
     namespace detail
     {
-<<<<<<< HEAD
-    
-      /** @brief Increments a row iterator (iteration along increasing row indices) up to a certain row index k.
-      * 
-      * Generic implementation using the iterator concept from boost::numeric::ublas. Could not find a better way for sparse matrices...
-      *
-      * @param row_iter   The row iterator
-      * @param k      The final row index
-      */
-      template <typename T>
-      void ilu_inc_row_iterator_to_row_index(T & row_iter, unsigned int k)
-      {
-        while (row_iter.index1() < k)
-          ++row_iter;
-      }
-      
-      /** @brief Increments a row iterator (iteration along increasing row indices) up to a certain row index k.
-      * 
-      * Specialization for the sparse matrix adapter shipped with ViennaCL
-      *
-      * @param row_iter   The row iterator
-      * @param k      The final row index
-      */
-      template <typename ScalarType>
-      void ilu_inc_row_iterator_to_row_index(viennacl::tools::sparse_matrix_adapter<ScalarType> & row_iter, unsigned int k)
-      {
-        row_iter += k - row_iter.index1();
-      }
-      
-      /** @brief Increments a row iterator (iteration along increasing row indices) up to a certain row index k.
-      * 
-      * Specialization for the const sparse matrix adapter shipped with ViennaCL
-      *
-      * @param row_iter   The row iterator
-      * @param k      The final row index
-      */
-      template <typename ScalarType>
-      void ilu_inc_row_iterator_to_row_index(viennacl::tools::const_sparse_matrix_adapter<ScalarType> & row_iter, unsigned int k)
-      {
-        row_iter += k - row_iter.index1();
-      }
-
-      /** @brief Generic inplace solution of a unit lower triangular system
-      *   
-      * @param mat  The system matrix
-      * @param vec  The right hand side vector
-      */
-      template<typename MatrixType, typename VectorType>
-      void ilu_inplace_solve(MatrixType const & mat, VectorType & vec, viennacl::linalg::unit_lower_tag)
-      {
-        typedef typename MatrixType::const_iterator1    InputRowIterator;  //iterate along increasing row index
-        typedef typename MatrixType::const_iterator2    InputColIterator;  //iterate along increasing column index
-        
-        for (InputRowIterator row_iter = mat.begin1(); row_iter != mat.end1(); ++row_iter)
-        {
-          for (InputColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-          {
-            if (col_iter.index2() < col_iter.index1())
-              vec[col_iter.index1()] -= *col_iter * vec[col_iter.index2()];
-          }
-        }
-      }
-
-      /** @brief Generic inplace solution of a upper triangular system
-      *   
-      * @param mat  The system matrix
-      * @param vec  The right hand side vector
-      */
-      template<typename MatrixType, typename VectorType>
-      void ilu_inplace_solve(MatrixType const & mat, VectorType & vec, viennacl::linalg::upper_tag)
-      {
-        typedef typename MatrixType::const_reverse_iterator1    InputRowIterator;  //iterate along increasing row index
-        typedef typename MatrixType::const_iterator2            InputColIterator;  //iterate along increasing column index
-        typedef typename VectorType::value_type                 ScalarType;
-        
-        ScalarType diagonal_entry = 1.0;
-        
-        for (InputRowIterator row_iter = mat.rbegin1(); row_iter != mat.rend1(); ++row_iter)
-        {
-          for (InputColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-          {
-            if (col_iter.index2() > col_iter.index1())
-              vec[col_iter.index1()] -= *col_iter * vec[col_iter.index2()];
-            if (col_iter.index2() == col_iter.index1())
-              diagonal_entry = *col_iter;
-          }
-          vec[row_iter.index1()] /= diagonal_entry;
-        }
-      }
-
-      /** @brief Generic LU substitution
-      *   
-      * @param mat  The system matrix
-      * @param vec  The right hand side vector
-      */
-      template<typename MatrixType, typename VectorType>
-      void ilu_lu_substitute(MatrixType const & mat, VectorType & vec)
-      {
-        ilu_inplace_solve(mat, vec, unit_lower_tag());
-        ilu_inplace_solve(mat, vec, upper_tag());
-      }
-
-=======
 
 
       //
@@ -370,7 +250,6 @@ namespace viennacl
 
 
 
->>>>>>> upstream/1.5.1
     } // namespace detail
   } // namespace linalg
 } // namespace viennacl
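
The substitution helpers in this file amount to two triangular sweeps over the combined LU factor; a plain dense-storage illustration of that two-pass idea (not ViennaCL API, purely for orientation):

// Forward substitution with an implicit unit lower triangle, then backward
// substitution with the stored upper triangle, both in place on x.
#include <vector>
#include <cstddef>

void lu_substitute_dense(std::vector< std::vector<double> > const & LU,
                         std::vector<double> & x)
{
  std::size_t n = x.size();

  for (std::size_t i = 0; i < n; ++i)          // unit-lower sweep
    for (std::size_t j = 0; j < i; ++j)
      x[i] -= LU[i][j] * x[j];

  for (std::size_t i = n; i > 0; --i)          // upper sweep
  {
    for (std::size_t j = i; j < n; ++j)
      x[i-1] -= LU[i-1][j] * x[j];
    x[i-1] /= LU[i-1][i-1];
  }
}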
diff --git a/viennacl/linalg/detail/ilu/ilu0.hpp b/viennacl/linalg/detail/ilu/ilu0.hpp
index 0bc923f..d9b11ed 100644
--- a/viennacl/linalg/detail/ilu/ilu0.hpp
+++ b/viennacl/linalg/detail/ilu/ilu0.hpp
@@ -3,15 +3,6 @@
 #define VIENNACL_LINALG_DETAIL_ILU0_HPP_
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-   Institute for Analysis and Scientific Computing,
-   TU Wien.
-
-   -----------------
-   ViennaCL - The Vienna Computing Library
-   -----------------
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
@@ -20,28 +11,11 @@
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
->>>>>>> upstream/1.5.1
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
 
    (A list of authors and contributors can be found in the PDF manual)
 
-<<<<<<< HEAD
-License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/linalg/detail/ilu/ilu0.hpp
-  @brief Implementations of incomplete factorization preconditioners with static nonzero pattern. Contributed by Evan Bollig.
-
-  ILU0 (Incomplete LU with zero fill-in) 
-  - All preconditioner nonzeros exist at locations that were nonzero in the input matrix. 
-  - The number of nonzeros in the output preconditioner are exactly the same number as the input matrix
-
- Evan Bollig 3/30/12
- 
- Adapted from viennacl/linalg/detail/ilut.hpp
-
-=======
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
@@ -60,16 +34,10 @@ License:         MIT (X11), see file LICENSE in the base directory
 
  Low-level reimplementation by Karl Rupp in Nov 2012, increasing performance substantially. Also added level-scheduling.
 
->>>>>>> upstream/1.5.1
 */
 
 #include <vector>
 #include <cmath>
-<<<<<<< HEAD
-#include "viennacl/forwards.h"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/detail/ilu/common.hpp"
-=======
 #include <iostream>
 #include "viennacl/forwards.h"
 #include "viennacl/tools/tools.hpp"
@@ -78,7 +46,6 @@ License:         MIT (X11), see file LICENSE in the base directory
 #include "viennacl/backend/memory.hpp"
 
 #include "viennacl/linalg/host_based/common.hpp"
->>>>>>> upstream/1.5.1
 
 #include <map>
 
@@ -87,127 +54,11 @@ namespace viennacl
   namespace linalg
   {
 
-<<<<<<< HEAD
-    /** @brief A tag for incomplete LU factorization with threshold (ILUT)
-=======
     /** @brief A tag for incomplete LU factorization with static pattern (ILU0)
->>>>>>> upstream/1.5.1
     */
     class ilu0_tag
     {
       public:
-<<<<<<< HEAD
-        /** @brief The constructor.
-          *
-          * @param row_start     The starting row for the block to which we apply ILU
-          * @param row_end       The end column of the block to which we apply ILU
-          */
-        ilu0_tag(unsigned int row_start = 0, unsigned int row_end = -1)
-            : _row_start(row_start),  
-            _row_end(row_end) {}
-              
-      public: 
-        unsigned int _row_start, _row_end;
-    };
-
-
-    /** @brief Implementation of a ILU-preconditioner with static pattern
-      *
-      * refer to the Algorithm in Saad's book (1996 edition)
-      *
-      *  @param input   The input matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns
-      *  @param output  The output matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns and write access via operator()
-      *  @param tag     An ilu0_tag in order to dispatch among several other preconditioners.
-      */
-    template<typename MatrixType, typename LUType>
-    void precondition(MatrixType const & input, LUType & output, ilu0_tag const & tag)
-    {
-      typedef std::map<unsigned int, double>          SparseVector;
-      typedef typename SparseVector::iterator         SparseVectorIterator;
-      typedef typename MatrixType::const_iterator1    InputRowIterator;  //iterate along increasing row index
-      typedef typename MatrixType::const_iterator2    InputColIterator;  //iterate along increasing column index
-      typedef typename LUType::iterator1              OutputRowIterator;  //iterate along increasing row index
-      typedef typename LUType::iterator2              OutputColIterator;  //iterate along increasing column index
-
-      output.clear();
-      assert(input.size1() == output.size1());
-      assert(input.size2() == output.size2());
-      output.resize(static_cast<unsigned int>(input.size1()), static_cast<unsigned int>(input.size2()), false);
-      SparseVector w;
-
-
-      std::map<double, unsigned int> temp_map;
-
-      // For i = 2, ... , N, DO
-      for (InputRowIterator row_iter = input.begin1(); row_iter != input.end1(); ++row_iter)
-      {
-        w.clear();
-        for (InputColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-        {
-          // Only work on the block described by (row_start:row_end, row_start:row_end)
-          if ((static_cast<unsigned int>(row_iter.index1()) >= tag._row_start) && (static_cast<unsigned int>(row_iter.index1()) < tag._row_end))
-          {
-              if ((static_cast<unsigned int>(col_iter.index2()) >= tag._row_start) && (static_cast<unsigned int>(col_iter.index2()) < tag._row_end))
-              {
-                  w[static_cast<unsigned int>(col_iter.index2())] = *col_iter;
-              }
-          } 
-          else 
-          {
-              // Put identity on the excluded diagonal
-              w[static_cast<unsigned int>(row_iter.index1())] = 1.; 
-          }
-        }
-
-        //line 3:
-        OutputRowIterator row_iter_out = output.begin1();
-        for (SparseVectorIterator k = w.begin(); k != w.end(); ++k)
-        {
-          unsigned int index_k = k->first;
-          // Enforce i = 2 and 
-          if (index_k >= static_cast<unsigned int>(row_iter.index1()))
-              break;
-
-          detail::ilu_inc_row_iterator_to_row_index(row_iter_out, index_k);
-
-          //line 3: temp = a_ik = a_ik / a_kk
-          double temp = k->second / output(index_k, index_k);
-          if (output(index_k, index_k) == 0.0)
-          {
-              std::cerr << "ViennaCL: FATAL ERROR in ILUT(): Diagonal entry is zero in row " << index_k << "!" << std::endl;
-
-          }
-
-          for (OutputColIterator j = row_iter_out.begin(); j != row_iter_out.end(); ++j)
-          {
-              // Only fill if it a nonzero element of the input matrix
-              if (input(row_iter.index1(), j.index2())) {
-                  // Follow standard ILU algorithm (i.e., for j = k+1, ... , N)
-                  if (j.index2() > index_k) 
-                  {
-                      // set a_ij
-                      w[j.index2()] -= temp * *j;
-                  }
-              }
-          }
-          // Set a_ik
-          w[index_k] = temp;
-          
-        } //for k
-
-        // Write rows back to LU factor output
-        unsigned int k_count = 0; 
-        for (SparseVectorIterator k = w.begin(); k != w.end(); ++k )
-        {
-          output(static_cast<unsigned int>(row_iter.index1()), k->first) = static_cast<typename LUType::value_type>(w[k->first]);
-          k_count ++; 
-        }
-      } //for i
-    }
-
-
-    /** @brief ILUT preconditioner class, can be supplied to solve()-routines
-=======
         ilu0_tag(bool with_level_scheduling = false) : use_level_scheduling_(with_level_scheduling) {}
 
         bool use_level_scheduling() const { return use_level_scheduling_; }
@@ -291,7 +142,6 @@ namespace viennacl
 
 
     /** @brief ILU0 preconditioner class, can be supplied to solve()-routines
->>>>>>> upstream/1.5.1
     */
     template <typename MatrixType>
     class ilu0_precond
@@ -299,30 +149,15 @@ namespace viennacl
         typedef typename MatrixType::value_type      ScalarType;
 
       public:
-<<<<<<< HEAD
-        ilu0_precond(MatrixType const & mat, ilu0_tag const & tag) : _tag(tag), LU(mat.size1())
-        {
-            //initialize preconditioner:
-            //std::cout << "Start CPU precond" << std::endl;
-            init(mat);          
-=======
         ilu0_precond(MatrixType const & mat, ilu0_tag const & tag) : tag_(tag), LU()
         {
             //initialize preconditioner:
             //std::cout << "Start CPU precond" << std::endl;
             init(mat);
->>>>>>> upstream/1.5.1
             //std::cout << "End CPU precond" << std::endl;
         }
 
         template <typename VectorType>
-<<<<<<< HEAD
-            void apply(VectorType & vec) const
-            {
-                viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU);
-                viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, vec);
-            }
-=======
         void apply(VectorType & vec) const
         {
           unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle1());
@@ -332,24 +167,10 @@ namespace viennacl
           viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LU.size2(), unit_lower_tag());
           viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LU.size2(), upper_tag());
         }
->>>>>>> upstream/1.5.1
 
       private:
         void init(MatrixType const & mat)
         {
-<<<<<<< HEAD
-            viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU);
-            viennacl::linalg::precondition(mat, LU_adapter, _tag);
-        }
-
-        ilu0_tag const & _tag;
-        
-        public: std::vector< std::map<unsigned int, ScalarType> > LU;
-    };
-
-
-    /** @brief ILUT preconditioner class, can be supplied to solve()-routines.
-=======
           viennacl::context host_context(viennacl::MAIN_MEMORY);
           viennacl::switch_memory_context(LU, host_context);
 
@@ -364,7 +185,6 @@ namespace viennacl
 
 
     /** @brief ILU0 preconditioner class, can be supplied to solve()-routines.
->>>>>>> upstream/1.5.1
       *
       *  Specialization for compressed_matrix
       */
@@ -373,15 +193,6 @@ namespace viennacl
     {
         typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>   MatrixType;
 
-<<<<<<< HEAD
-        public:
-        ilu0_precond(MatrixType const & mat, ilu0_tag const & tag) : _tag(tag), LU(mat.size1())
-        {
-            //initialize preconditioner:
-            //std::cout << "Start GPU precond" << std::endl;
-            init(mat);          
-            //std::cout << "End GPU precond" << std::endl;
-=======
       public:
         ilu0_precond(MatrixType const & mat, ilu0_tag const & tag) : tag_(tag), LU(mat.size1(), mat.size2())
         {
@@ -389,42 +200,10 @@ namespace viennacl
           //std::cout << "Start GPU precond" << std::endl;
           init(mat);
           //std::cout << "End GPU precond" << std::endl;
->>>>>>> upstream/1.5.1
         }
 
         void apply(vector<ScalarType> & vec) const
         {
-<<<<<<< HEAD
-            copy(vec, temp_vec);
-            //lu_substitute(LU, vec);
-            viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU);
-            viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, temp_vec);
-
-            copy(temp_vec, vec);
-        }
-
-        private:
-        void init(MatrixType const & mat)
-        {
-            std::vector< std::map<unsigned int, ScalarType> > temp(mat.size1());
-            //std::vector< std::map<unsigned int, ScalarType> > LU_cpu(mat.size1());
-
-            //copy to cpu:
-            copy(mat, temp);
-
-            viennacl::tools::const_sparse_matrix_adapter<ScalarType>       temp_adapter(temp);
-            viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU);
-            viennacl::linalg::precondition(temp_adapter, LU_adapter, _tag);
-
-            temp_vec.resize(mat.size1());
-
-        }
-
-        ilu0_tag const & _tag;
-        //MatrixType LU;
-        public: std::vector< std::map<unsigned int, ScalarType> > LU;
-        private: mutable std::vector<ScalarType> temp_vec;
-=======
           viennacl::context host_context(viennacl::MAIN_MEMORY);
           if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
           {
@@ -588,7 +367,6 @@ namespace viennacl
         std::list< viennacl::backend::mem_handle > multifrontal_U_element_buffers_;
         std::list< vcl_size_t > multifrontal_U_row_elimination_num_list_;
 
->>>>>>> upstream/1.5.1
     };
 
   }
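
A usage sketch for the ILU0 preconditioner defined here (illustrative only, not part of the patch; placeholder system):

// Illustrative sketch: ILU0 preconditioner paired with CG.
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/cg.hpp"

viennacl::vector<double> solve_with_ilu0(viennacl::compressed_matrix<double> const & A,
                                         viennacl::vector<double> const & rhs)
{
  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<double> >
      precond(A, viennacl::linalg::ilu0_tag());

  return viennacl::linalg::solve(A, rhs, viennacl::linalg::cg_tag(1e-8, 300), precond);
}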
diff --git a/viennacl/linalg/detail/ilu/ilut.hpp b/viennacl/linalg/detail/ilu/ilut.hpp
index e291583..311f0c1 100644
--- a/viennacl/linalg/detail/ilu/ilut.hpp
+++ b/viennacl/linalg/detail/ilu/ilut.hpp
@@ -2,27 +2,17 @@
 #define VIENNACL_LINALG_DETAIL_ILUT_HPP_
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -34,20 +24,14 @@
 
 #include <vector>
 #include <cmath>
-<<<<<<< HEAD
-=======
 #include <iostream>
->>>>>>> upstream/1.5.1
 #include "viennacl/forwards.h"
 #include "viennacl/tools/tools.hpp"
 
 #include "viennacl/linalg/detail/ilu/common.hpp"
-<<<<<<< HEAD
-=======
 #include "viennacl/compressed_matrix.hpp"
 
 #include "viennacl/linalg/host_based/common.hpp"
->>>>>>> upstream/1.5.1
 
 #include <map>
 
@@ -55,11 +39,7 @@ namespace viennacl
 {
   namespace linalg
   {
-<<<<<<< HEAD
-    
-=======
 
->>>>>>> upstream/1.5.1
     /** @brief A tag for incomplete LU factorization with threshold (ILUT)
     */
     class ilut_tag
@@ -67,13 +47,6 @@ namespace viennacl
       public:
         /** @brief The constructor.
         *
-<<<<<<< HEAD
-        * @param entries_per_row  Number of nonzero entries per row in L and U. Note that L and U are stored in a single matrix, thus there are 2*entries_per_row in total.
-        * @param drop_tolerance   The drop tolerance for ILUT
-        */
-        ilut_tag(unsigned int entries_per_row = 20,
-                 double drop_tolerance = 1e-4) : _entries_per_row(entries_per_row), _drop_tolerance(drop_tolerance) {}; 
-=======
         * @param entries_per_row        Number of nonzero entries per row in L and U. Note that L and U are stored in a single matrix, thus there are 2*entries_per_row in total.
         * @param drop_tolerance         The drop tolerance for ILUT
         * @param with_level_scheduling  Flag for enabling level scheduling on GPUs.
@@ -81,123 +54,10 @@ namespace viennacl
         ilut_tag(unsigned int entries_per_row = 20,
                  double drop_tolerance = 1e-4,
                  bool with_level_scheduling = false) : entries_per_row_(entries_per_row), drop_tolerance_(drop_tolerance), use_level_scheduling_(with_level_scheduling) {}
->>>>>>> upstream/1.5.1
 
         void set_drop_tolerance(double tol)
         {
           if (tol > 0)
-<<<<<<< HEAD
-            _drop_tolerance = tol;
-        }
-        double get_drop_tolerance() const { return _drop_tolerance; }
-        
-        void set_entries_per_row(unsigned int e)
-        {
-          if (e > 0)
-            _entries_per_row = e;
-        }
-
-        unsigned int get_entries_per_row() const { return _entries_per_row; }
-
-      private:
-        unsigned int _entries_per_row;
-        double _drop_tolerance;
-    };
-    
-        
-    /** @brief Implementation of a ILU-preconditioner with threshold
-    *
-    * refer to Algorithm 10.6 by Saad's book (1996 edition)
-    *
-    *  @param input   The input matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns
-    *  @param output  The output matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns and write access via operator()
-    *  @param tag     An ilut_tag in order to dispatch among several other preconditioners.
-    */
-    template<typename MatrixType, typename LUType>
-    void precondition(MatrixType const & input, LUType & output, ilut_tag const & tag)
-    {
-      typedef std::map<unsigned int, double>          SparseVector;
-      typedef typename SparseVector::iterator         SparseVectorIterator;
-      typedef typename MatrixType::const_iterator1    InputRowIterator;  //iterate along increasing row index
-      typedef typename MatrixType::const_iterator2    InputColIterator;  //iterate along increasing column index
-      typedef typename LUType::iterator1              OutputRowIterator;  //iterate along increasing row index
-      typedef typename LUType::iterator2              OutputColIterator;  //iterate along increasing column index
-
-      output.clear();
-      assert(input.size1() == output.size1());
-      assert(input.size2() == output.size2());
-      output.resize(static_cast<unsigned int>(input.size1()), static_cast<unsigned int>(input.size2()), false);
-      SparseVector w;
-      
-      std::map<double, unsigned int> temp_map;
-      
-      for (InputRowIterator row_iter = input.begin1(); row_iter != input.end1(); ++row_iter)
-      {
-    /*    if (i%10 == 0)
-      std::cout << i << std::endl;*/
-        
-        //line 2:
-        w.clear();
-        for (InputColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-          w[static_cast<unsigned int>(col_iter.index2())] = *col_iter;
-
-        //line 3:
-        OutputRowIterator row_iter_out = output.begin1();
-        for (SparseVectorIterator w_k = w.begin(); w_k != w.end(); ++w_k)
-        {
-          unsigned int k = w_k->first;
-          if (k >= static_cast<unsigned int>(row_iter.index1()))
-            break;
-          
-          
-          //while (row_iter_out.index1() < index_k)
-          //  ++row_iter_out;
-          //if (row_iter_out.index1() < index_k)
-          //  row_iter_out += index_k - row_iter_out.index1();
-          detail::ilu_inc_row_iterator_to_row_index(row_iter_out, k);
-          
-          //line 4:
-          double a_kk = output(k, k);
-          double temp = w_k->second / a_kk;
-          if (a_kk == 0.0)
-          {
-            std::cerr << "ViennaCL: FATAL ERROR in ILUT(): Diagonal entry is zero in row " << k 
-                      << " while processing line " << row_iter.index1() << "!" << std::endl;
-          }
-          
-          //line 5: (dropping rule to w_k)
-          if ( fabs(temp) > tag.get_drop_tolerance())
-          {
-            //line 7:
-            for (OutputColIterator u_k = row_iter_out.begin(); u_k != row_iter_out.end(); ++u_k)
-            {
-              if (u_k.index2() >= k)
-                w[u_k.index2()] -= temp * *u_k;
-            }
-          }
-        } //for k
-        
-        //Line 10: Apply a dropping rule to w
-        //Sort entries which are kept
-        temp_map.clear();
-        for (SparseVectorIterator w_k = w.begin(); w_k != w.end(); )
-        {
-          if ( (fabs(w_k->second) < tag.get_drop_tolerance()) 
-               && (w_k->first != static_cast<unsigned int>(row_iter.index1())) //do not drop diagonal element!
-             )
-          { 
-            long index = w_k->first;
-            ++w_k;
-            w.erase(index);
-          }
-          else
-          {
-            double temp = fabs(w_k->second);
-            while (temp_map.find(temp) != temp_map.end())
-              temp *= 1.00000001; //make entry slightly larger to maintain uniqueness of the entry
-            temp_map[temp] = w_k->first;
-            ++w_k;
-=======
             drop_tolerance_ = tol;
         }
         double get_drop_tolerance() const { return drop_tolerance_; }
@@ -343,38 +203,10 @@ namespace viennacl
               throw "Triangular factor in ILUT singular!";
 
             temp_map.insert(std::make_pair(abs_w_k, std::make_pair(k, w_k_entry)));
->>>>>>> upstream/1.5.1
           }
         }
 
         //Lines 10-12: write the largest p values to L and U
-<<<<<<< HEAD
-        unsigned int written_L = 0;
-        unsigned int written_U = 0;
-        for (typename std::map<double, unsigned int>::reverse_iterator iter = temp_map.rbegin(); iter != temp_map.rend(); ++iter)
-        {
-          if (iter->second > static_cast<unsigned int>(row_iter.index1())) //entry for U
-          {
-            if (written_U < tag.get_entries_per_row())
-            {
-              output(static_cast<unsigned int>(row_iter.index1()), iter->second) = static_cast<typename LUType::value_type>(w[iter->second]);
-              ++written_U;
-            }
-          }
-          else if (iter->second == static_cast<unsigned int>(row_iter.index1()))
-          {
-            output(iter->second, iter->second) = static_cast<typename LUType::value_type>(w[static_cast<unsigned int>(row_iter.index1())]);
-          }
-          else //entry for L
-          {
-            if (written_L < tag.get_entries_per_row())
-            {
-              output(static_cast<unsigned int>(row_iter.index1()), iter->second) = static_cast<typename LUType::value_type>(w[iter->second]);
-              ++written_L;
-            }
-          }
-        }
-=======
         SizeType written_L = 0;
         SizeType written_U = 0;
         for (typename TemporarySortMap::reverse_iterator iter = temp_map.rbegin(); iter != temp_map.rend(); ++iter)
@@ -407,7 +239,6 @@ namespace viennacl
 
         w.clear(); //Line 13
 
->>>>>>> upstream/1.5.1
       } //for i
     }
 
@@ -418,37 +249,6 @@ namespace viennacl
     class ilut_precond
     {
       typedef typename MatrixType::value_type      ScalarType;
-<<<<<<< HEAD
-      
-      public:
-        ilut_precond(MatrixType const & mat, ilut_tag const & tag) : _tag(tag), LU(mat.size1())
-        {
-          //initialize preconditioner:
-          //std::cout << "Start CPU precond" << std::endl;
-          init(mat);          
-          //std::cout << "End CPU precond" << std::endl;
-        }
-        
-        template <typename VectorType>
-        void apply(VectorType & vec) const
-        {
-          viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU, LU.size(), LU.size());
-          viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, vec);
-        }
-        
-      private:
-        void init(MatrixType const & mat)
-        {
-          viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU, LU.size(), LU.size());
-          viennacl::linalg::precondition(mat, LU_adapter, _tag);
-        }
-        
-        ilut_tag const & _tag;
-        std::vector< std::map<unsigned int, ScalarType> > LU;
-    };
-
-    
-=======
 
       public:
         ilut_precond(MatrixType const & mat, ilut_tag const & tag) : tag_(tag), LU(mat.size1(), mat.size2())
@@ -493,7 +293,6 @@ namespace viennacl
     };
 
 
->>>>>>> upstream/1.5.1
     /** @brief ILUT preconditioner class, can be supplied to solve()-routines.
     *
     *  Specialization for compressed_matrix
@@ -502,51 +301,6 @@ namespace viennacl
     class ilut_precond< compressed_matrix<ScalarType, MAT_ALIGNMENT> >
     {
       typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>   MatrixType;
-<<<<<<< HEAD
-      
-      public:
-        ilut_precond(MatrixType const & mat, ilut_tag const & tag) : _tag(tag), LU(mat.size1())
-        {
-          //initialize preconditioner:
-          //std::cout << "Start GPU precond" << std::endl;
-          init(mat);          
-          //std::cout << "End GPU precond" << std::endl;
-        }
-        
-        void apply(vector<ScalarType> & vec) const
-        {
-          copy(vec, temp_vec);
-          //lu_substitute(LU, vec);
-          viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU, LU.size(), LU.size());
-          viennacl::linalg::detail::ilu_lu_substitute(LU_const_adapter, temp_vec);
-          
-          copy(temp_vec, vec);
-        }
-        
-      private:
-        void init(MatrixType const & mat)
-        {
-          std::vector< std::map<unsigned int, ScalarType> > temp(mat.size1());
-          //std::vector< std::map<unsigned int, ScalarType> > LU_cpu(mat.size1());
-
-          //copy to cpu:
-          copy(mat, temp);
-          
-          viennacl::tools::const_sparse_matrix_adapter<ScalarType>       temp_adapter(temp, temp.size(), temp.size());
-          viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU, LU.size(), LU.size());
-          viennacl::linalg::precondition(temp_adapter, LU_adapter, _tag);
-          
-          temp_vec.resize(mat.size1());
-          
-          //copy resulting preconditioner back to gpu:
-          //copy(LU_cpu, LU);
-        }
-        
-        ilut_tag const & _tag;
-        //MatrixType LU;
-        std::vector< std::map<unsigned int, ScalarType> > LU;
-        mutable std::vector<ScalarType> temp_vec;
-=======
 
       public:
         ilut_precond(MatrixType const & mat, ilut_tag const & tag) : tag_(tag), LU(mat.size1(), mat.size2())
@@ -718,7 +472,6 @@ namespace viennacl
         std::list< viennacl::backend::mem_handle > multifrontal_U_col_buffers_;
         std::list< viennacl::backend::mem_handle > multifrontal_U_element_buffers_;
         std::list< vcl_size_t > multifrontal_U_row_elimination_num_list_;
->>>>>>> upstream/1.5.1
     };
 
   }
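
A usage sketch for the ILUT preconditioner defined here (illustrative only, not part of the patch; the tag parameters are example values):

// Illustrative sketch: ILUT (20 entries per row, drop tolerance 1e-4) with BiCGStab.
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/bicgstab.hpp"

viennacl::vector<double> solve_with_ilut(viennacl::compressed_matrix<double> const & A,
                                         viennacl::vector<double> const & rhs)
{
  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<double> >
      precond(A, viennacl::linalg::ilut_tag(20, 1e-4));

  return viennacl::linalg::solve(A, rhs, viennacl::linalg::bicgstab_tag(1e-8, 400), precond);
}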
diff --git a/viennacl/linalg/eig.hpp b/viennacl/linalg/eig.hpp
index 98ce0c0..8479f94 100644
--- a/viennacl/linalg/eig.hpp
+++ b/viennacl/linalg/eig.hpp
@@ -2,37 +2,23 @@
 #define VIENNACL_LINALG_EIG_HPP_
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-<<<<<<< HEAD
-/** @file eig.hpp
-=======
 /** @file viennacl/linalg/eig.hpp
->>>>>>> upstream/1.5.1
 *   @brief Convenience header file including all available eigenvalue algorithms
 */
 
@@ -40,8 +26,4 @@
 #include "viennacl/linalg/lanczos.hpp"
 #include "viennacl/linalg/power_iter.hpp"
 
-<<<<<<< HEAD
-#endif
-=======
 #endif
->>>>>>> upstream/1.5.1
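
This convenience header makes both eigenvalue routines available; a sketch of the power-iteration variant through viennacl::linalg::eig() (illustrative only, not part of the patch; assumes a placeholder sparse matrix A as in the power-iter tutorial):

// Illustrative sketch: estimate the largest eigenvalue via the power iteration.
#include <iostream>
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/eig.hpp"

void print_largest_eigenvalue(viennacl::compressed_matrix<double> const & A)
{
  viennacl::linalg::power_iter_tag ptag(1e-8);   // termination factor
  std::cout << "Largest eigenvalue: " << viennacl::linalg::eig(A, ptag) << std::endl;
}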
diff --git a/viennacl/linalg/gmres.hpp b/viennacl/linalg/gmres.hpp
index bc14c8a..7768763 100644
--- a/viennacl/linalg/gmres.hpp
+++ b/viennacl/linalg/gmres.hpp
@@ -189,19 +189,6 @@ namespace viennacl
       std::vector<CPU_ScalarType>  betas(krylov_dim);
 
       CPU_ScalarType norm_rhs = viennacl::linalg::norm_2(rhs);
-<<<<<<< HEAD
-      
-      if (norm_rhs == 0) //solution is zero if RHS norm is zero
-        return result;
-      
-      unsigned int k;
-      for (k = 0; k < krylov_dim; ++k)
-      {
-        R[k].resize(tag.krylov_dim()); 
-        viennacl::traits::resize(U[k], problem_size);
-      }
-=======
->>>>>>> upstream/1.5.1
 
       if (norm_rhs == 0) //solution is zero if RHS norm is zero
         return result;
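
For orientation, a typical call into this restarted GMRES implementation (illustrative only, not part of the patch; placeholders):

// Illustrative sketch: GMRES with tolerance 1e-8, at most 200 iterations, Krylov dimension 30.
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/gmres.hpp"

viennacl::vector<double> solve_general(viennacl::compressed_matrix<double> const & A,
                                       viennacl::vector<double> const & rhs)
{
  viennacl::linalg::gmres_tag tag(1e-8, 200, 30);  // tolerance, max. iterations, Krylov dimension
  return viennacl::linalg::solve(A, rhs, tag);
}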
diff --git a/viennacl/linalg/ilu.hpp b/viennacl/linalg/ilu.hpp
index 99d67c0..f913649 100644
--- a/viennacl/linalg/ilu.hpp
+++ b/viennacl/linalg/ilu.hpp
@@ -18,11 +18,7 @@
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-<<<<<<< HEAD
-/** @file ilu.hpp
-=======
 /** @file viennacl/linalg/ilu.hpp
->>>>>>> upstream/1.5.1
     @brief Implementations of incomplete factorization preconditioners. Convenience header file.
 */
 
diff --git a/viennacl/linalg/inner_prod.hpp b/viennacl/linalg/inner_prod.hpp
index f1b922a..ed810db 100644
--- a/viennacl/linalg/inner_prod.hpp
+++ b/viennacl/linalg/inner_prod.hpp
@@ -125,33 +125,6 @@ namespace viennacl
                                           const vector_base<NumericT>,
                                           viennacl::op_inner_prod >(vector1, vector2);
     }
-<<<<<<< HEAD
-    
-    template< typename VectorType >
-    viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
-                                 const viennacl::vector_range<VectorType>,
-                                 viennacl::op_inner_prod >
-    inner_prod(viennacl::vector_range<VectorType> const & vector1,
-               viennacl::vector_range<VectorType> const & vector2)
-    {
-      return viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
-                                          const viennacl::vector_range<VectorType>,
-                                          viennacl::op_inner_prod >(vector1, vector2);
-    }
-
-    template< typename VectorType >
-    viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
-                                 const viennacl::vector_slice<VectorType>,
-                                 viennacl::op_inner_prod >
-    inner_prod(viennacl::vector_slice<VectorType> const & vector1,
-               viennacl::vector_slice<VectorType> const & vector2)
-    {
-      return viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
-                                          const viennacl::vector_slice<VectorType>,
-                                          viennacl::op_inner_prod >(vector1, vector2);
-    }
-
-=======
 
     // expression on rhs:
     template <typename NumericT, typename LHS, typename RHS, typename OP>
@@ -195,7 +168,6 @@ namespace viennacl
     }
 
 
->>>>>>> upstream/1.5.1
   } // end namespace linalg
 } // end namespace viennacl
 #endif
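
A usage sketch of the inner_prod() interface kept here (illustrative only, not part of the patch):

// Illustrative sketch: the expression returned by inner_prod() is evaluated on
// assignment to a CPU scalar (or to a viennacl::scalar).
#include "viennacl/vector.hpp"
#include "viennacl/linalg/inner_prod.hpp"

double dot(viennacl::vector<double> const & x, viennacl::vector<double> const & y)
{
  double result = viennacl::linalg::inner_prod(x, y);
  return result;
}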
diff --git a/viennacl/linalg/jacobi_precond.hpp b/viennacl/linalg/jacobi_precond.hpp
index 8bb5de3..bc268d9 100644
--- a/viennacl/linalg/jacobi_precond.hpp
+++ b/viennacl/linalg/jacobi_precond.hpp
@@ -121,27 +121,8 @@ namespace viennacl
         template <unsigned int ALIGNMENT>
         void apply(viennacl::vector<ScalarType, ALIGNMENT> & vec) const
         {
-<<<<<<< HEAD
-          assert(viennacl::traits::size1(system_matrix) == viennacl::traits::size(vec));
-          
-          //run kernel:
-          viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<ScalarType, ALIGNMENT>::program_name(),
-                                                                "diag_precond");
-
-          viennacl::ocl::enqueue(
-             k(viennacl::traits::handle(diag_A_inv),
-                cl_uint(viennacl::traits::start(diag_A_inv)),
-                cl_uint(viennacl::traits::stride(diag_A_inv)),
-                cl_uint(viennacl::traits::size(diag_A_inv)),
-               viennacl::traits::handle(vec),
-                cl_uint(viennacl::traits::start(vec)),
-                cl_uint(viennacl::traits::stride(vec)),
-                cl_uint(viennacl::traits::size(vec)) )
-                                );        
-=======
           assert(viennacl::traits::size(diag_A) == viennacl::traits::size(vec) && bool("Size mismatch"));
           vec = element_div(vec, diag_A);
->>>>>>> upstream/1.5.1
         }
 
       private:
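
In the Jacobi preconditioner the hand-rolled OpenCL "diag_precond" kernel launch from the
HEAD side is gone; the apply() kept above reduces to a size check plus
vec = element_div(vec, diag_A), an element-wise division by the stored diagonal. Typical use
together with the conjugate gradient solver, as a sketch (jacobi_tag, cg_tag and the
preconditioned solve() overload are assumed from this release, not shown in the hunk):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/cg.hpp"
    #include "viennacl/linalg/jacobi_precond.hpp"

    int main()
    {
      // Sketch only: A (symmetric positive definite) and rhs filled elsewhere.
      viennacl::compressed_matrix<double> A(1000, 1000);
      viennacl::vector<double>            rhs(1000);

      viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<double> >
          jacobi(A, viennacl::linalg::jacobi_tag());

      viennacl::vector<double> x =
          viennacl::linalg::solve(A, rhs, viennacl::linalg::cg_tag(), jacobi);
      (void)x;
    }
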
diff --git a/viennacl/linalg/lanczos.hpp b/viennacl/linalg/lanczos.hpp
index 4379f64..2785435 100644
--- a/viennacl/linalg/lanczos.hpp
+++ b/viennacl/linalg/lanczos.hpp
@@ -2,27 +2,17 @@
 #define VIENNACL_LINALG_LANCZOS_HPP_
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -30,19 +20,11 @@
 
 /** @file viennacl/linalg/lanczos.hpp
 *   @brief Generic interface for the Lanczos algorithm.
-<<<<<<< HEAD
-* 
-*   Contributed by Guenther Mader and Astrid Rupp.
-*/
-
-#include <math.h>    //for sqrt()
-=======
 *
 *   Contributed by Guenther Mader and Astrid Rupp.
 */
 
 #include <cmath>
->>>>>>> upstream/1.5.1
 #include <vector>
 #include "viennacl/vector.hpp"
 #include "viennacl/compressed_matrix.hpp"
@@ -58,27 +40,12 @@
 #include <boost/numeric/ublas/matrix_expression.hpp>
 #include <boost/numeric/ublas/matrix_sparse.hpp>
 #include <boost/numeric/ublas/vector.hpp>
-<<<<<<< HEAD
-#include <boost/numeric/ublas/operation.hpp> 
-=======
 #include <boost/numeric/ublas/operation.hpp>
->>>>>>> upstream/1.5.1
 #include <boost/numeric/ublas/vector_expression.hpp>
 #include <boost/numeric/ublas/io.hpp>
 
 namespace viennacl
 {
-<<<<<<< HEAD
-  namespace linalg 
-  {
-    
-    /** @brief A tag for the lanczos algorithm. 
-    */
-    class lanczos_tag 
-    {
-      public:
-        
-=======
   namespace linalg
   {
 
@@ -88,7 +55,6 @@ namespace viennacl
     {
       public:
 
->>>>>>> upstream/1.5.1
         enum
         {
           partial_reorthogonalization = 0,
@@ -99,17 +65,6 @@ namespace viennacl
         /** @brief The constructor
         *
         * @param factor                 Exponent of epsilon - tolerance for batches of Reorthogonalization
-<<<<<<< HEAD
-        * @param num_eigenvalues        Number of eigenvalues to be returned
-        * @param met                    Method for Lanczos-Algorithm: 0 for partial Reorthogonalization, 1 for full Reorthogonalization and 2 for Lanczos without Reorthogonalization
-        * @param krylov_size            Maximal krylov-space size
-        */
-
-        lanczos_tag(double factor = 0.75,
-                    std::size_t numeig = 10,
-                    int met = 0,
-                    std::size_t krylov = 100) : factor_(factor), num_eigenvalues_(numeig), method_(met), krylov_size_(krylov) {};
-=======
         * @param numeig                 Number of eigenvalues to be returned
         * @param met                    Method for Lanczos-Algorithm: 0 for partial Reorthogonalization, 1 for full Reorthogonalization and 2 for Lanczos without Reorthogonalization
         * @param krylov                 Maximum krylov-space size
@@ -119,55 +74,18 @@ namespace viennacl
                     vcl_size_t numeig = 10,
                     int met = 0,
                     vcl_size_t krylov = 100) : factor_(factor), num_eigenvalues_(numeig), method_(met), krylov_size_(krylov) {}
->>>>>>> upstream/1.5.1
 
         /** @brief Sets the number of eigenvalues */
         void num_eigenvalues(int numeig){ num_eigenvalues_ = numeig; }
 
           /** @brief Returns the number of eigenvalues */
-<<<<<<< HEAD
-        std::size_t num_eigenvalues() const { return num_eigenvalues_; }
-=======
         vcl_size_t num_eigenvalues() const { return num_eigenvalues_; }
->>>>>>> upstream/1.5.1
 
           /** @brief Sets the exponent of epsilon */
         void factor(double fct) { factor_ = fct; }
 
         /** @brief Returns the exponent */
         double factor() const { return factor_; }
-<<<<<<< HEAD
-        
-        /** @brief Sets the size of the kylov space */
-        void krylov_size(int max) { krylov_size_ = max; }
-
-        /** @brief Returns the size of the kylov space */  
-        std::size_t  krylov_size() const { return krylov_size_; }
-
-        /** @brief Sets the reorthogonalization method */ 
-        void method(int met){ method_ = met; }
-        
-        /** @brief Returns the reorthogonalization method */ 
-        int method() const { return method_; }
-
-
-      private: 
-        double factor_;
-        std::size_t num_eigenvalues_;
-        int method_; // see enum defined above for possible values
-        std::size_t krylov_size_;
-
-    };
-    
-    
-    namespace detail
-    {
-      /** 
-      *   @brief Implementation of the Lanczos PRO algorithm
-      *   
-      *   @param A            The system matrix
-      *   @param r            Random start vector 
-=======
 
         /** @brief Sets the size of the kylov space */
         void krylov_size(int max) { krylov_size_ = max; }
@@ -198,7 +116,6 @@ namespace viennacl
       *
       *   @param A            The system matrix
       *   @param r            Random start vector
->>>>>>> upstream/1.5.1
       *   @param size         Size of krylov-space
       *   @param tag          Lanczos_tag with several options for the algorithm
       *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
@@ -208,31 +125,6 @@ namespace viennacl
       std::vector<
               typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
               >
-<<<<<<< HEAD
-      lanczosPRO (MatrixT const& A, VectorT & r, int size, lanczos_tag const & tag)
-      {
-    
-        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
-        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
-
-        
-        // generation of some random numbers, used for lanczos PRO algorithm
-        boost::mt11213b mt;
-        boost::normal_distribution<double> N(0, 1);
-        boost::bernoulli_distribution<double> B(0.5);
-        boost::triangle_distribution<double> T(-1, 0, 1);
-
-        boost::variate_generator<boost::mt11213b&, boost::normal_distribution<double> >     get_N(mt, N);
-        boost::variate_generator<boost::mt11213b&, boost::bernoulli_distribution<double> >  get_B(mt, B);
-        boost::variate_generator<boost::mt11213b&, boost::triangle_distribution<double> >   get_T(mt, T);
-
-        
-        long i, j, k, index, retry, reorths;
-        std::vector<long> l_bound(size/2), u_bound(size/2);
-        bool second_step;
-        double squ_eps, eta, temp, eps, retry_th;
-        long n = r.size();
-=======
       lanczosPRO (MatrixT const& A, VectorT & r, vcl_size_t size, lanczos_tag const & tag)
       {
 
@@ -256,7 +148,6 @@ namespace viennacl
         bool second_step;
         CPU_ScalarType squ_eps, eta, temp, eps, retry_th;
         vcl_size_t n = r.size();
->>>>>>> upstream/1.5.1
         std::vector< std::vector<CPU_ScalarType> > w(2, std::vector<CPU_ScalarType>(size));
         CPU_ScalarType cpu_beta;
 
@@ -270,22 +161,6 @@ namespace viennacl
         boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
 
         second_step = false;
-<<<<<<< HEAD
-        eps = std::numeric_limits<double>::epsilon();
-        squ_eps = sqrt(eps);
-        retry_th = 1e-2;
-        eta =  exp(log(eps) * tag.factor());
-        reorths = 0;
-        retry = 0;
-        
-        vcl_beta = viennacl::linalg::norm_2(r);
-        
-        r /= vcl_beta;
-        
-        detail::copy_vec_to_vec(r,s);
-        boost::numeric::ublas::column(Q, 0) = s;
-        
-=======
         eps = std::numeric_limits<CPU_ScalarType>::epsilon();
         squ_eps = std::sqrt(eps);
         retry_th = 1e-2;
@@ -300,21 +175,14 @@ namespace viennacl
         detail::copy_vec_to_vec(r,s);
         boost::numeric::ublas::column(Q, 0) = s;
 
->>>>>>> upstream/1.5.1
         VectorT u = viennacl::linalg::prod(A, r);
         vcl_alpha = viennacl::linalg::inner_prod(u, r);
         alphas.push_back(vcl_alpha);
         w[0][0] = 1;
         betas.push_back(vcl_beta);
-<<<<<<< HEAD
-        
-        long batches = 0;
-        for(i = 1;i < size; i++)
-=======
 
         long batches = 0;
         for(i = 1;i < static_cast<long>(size); i++)
->>>>>>> upstream/1.5.1
         {
           r = u - vcl_alpha * r;
           vcl_beta = viennacl::linalg::norm_2(r);
@@ -326,17 +194,10 @@ namespace viennacl
           w[index][i] = 1;
           k = (i + 1) % 2;
           w[index][0] = (betas[1] * w[k][1] + (alphas[0] - vcl_alpha) * w[k][0] - betas[i - 1] * w[index][0]) / vcl_beta + eps * 0.3 * get_N() * (betas[1] + vcl_beta);
-<<<<<<< HEAD
-          
-          for(j = 1;j < i - 1;j++)
-          {
-                  w[index][j] = (betas[j + 1] * w[k][j + 1] + (alphas[j] - vcl_alpha) * w[k][j] + betas[j] * w[k][j - 1] - betas[i - 1] * w[index][j]) / vcl_beta + eps * 0.3 * get_N() * (betas[j + 1] + vcl_beta);      
-=======
 
           for(j = 1;j < i - 1;j++)
           {
                   w[index][j] = (betas[j + 1] * w[k][j + 1] + (alphas[j] - vcl_alpha) * w[k][j] + betas[j] * w[k][j - 1] - betas[i - 1] * w[index][j]) / vcl_beta + eps * 0.3 * get_N() * (betas[j + 1] + vcl_beta);
->>>>>>> upstream/1.5.1
           }
           w[index][i - 1] = 0.6 * eps * n * get_N() * betas[1] / vcl_beta;
 
@@ -351,11 +212,7 @@ namespace viennacl
               {
                 detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
                 inner_rt = viennacl::linalg::inner_prod(r,t);
-<<<<<<< HEAD
-                r = r - inner_rt * t;   
-=======
                 r = r - inner_rt * t;
->>>>>>> upstream/1.5.1
                 w[index][k] = 1.5 * eps * get_N();
                 reorths++;
               }
@@ -368,13 +225,8 @@ namespace viennacl
           batches = 0;
 
           for(j = 0;j < i;j++)
-<<<<<<< HEAD
-          { 
-            if(fabs(w[index][j]) >= squ_eps)
-=======
           {
             if(std::fabs(w[index][j]) >= squ_eps)
->>>>>>> upstream/1.5.1
             {
               detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, j), t);
               inner_rt = viennacl::linalg::inner_prod(r,t);
@@ -382,11 +234,7 @@ namespace viennacl
               w[index][j] = 1.5 * eps * get_N();
               k = j - 1;
               reorths++;
-<<<<<<< HEAD
-              while(k >= 0 && fabs(w[index][k]) > eta)
-=======
               while(k >= 0 && std::fabs(w[index][k]) > eta)
->>>>>>> upstream/1.5.1
               {
                 detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
                 inner_rt = viennacl::linalg::inner_prod(r,t);
@@ -397,21 +245,12 @@ namespace viennacl
               }
               l_bound[batches] = k + 1;
               k = j + 1;
-<<<<<<< HEAD
-              
-              while(k < i && fabs(w[index][k]) > eta)
-              {
-                detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
-                inner_rt = viennacl::linalg::inner_prod(r,t);
-                r = r - inner_rt * t;   
-=======
 
               while(k < i && std::fabs(w[index][k]) > eta)
               {
                 detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
                 inner_rt = viennacl::linalg::inner_prod(r,t);
                 r = r - inner_rt * t;
->>>>>>> upstream/1.5.1
                 w[index][k] = 1.5 * eps * get_N();
                 k++;
                 reorths++;
@@ -421,11 +260,7 @@ namespace viennacl
               j = k;
             }
           }
-<<<<<<< HEAD
-          
-=======
 
->>>>>>> upstream/1.5.1
           if(batches > 0)
           {
             temp = viennacl::linalg::norm_2(r);
@@ -448,11 +283,7 @@ namespace viennacl
               vcl_beta = vcl_beta * temp;
             }
           }
-<<<<<<< HEAD
-      
-=======
 
->>>>>>> upstream/1.5.1
           detail::copy_vec_to_vec(r,s);
           boost::numeric::ublas::column(Q, i) = s;
 
@@ -465,19 +296,6 @@ namespace viennacl
         }
 
         return bisect(alphas, betas);
-<<<<<<< HEAD
-      
-      }
-
-
-      /** 
-      *   @brief Implementation of the lanczos algorithm without reorthogonalization
-      * 
-      *   @param A            The system matrix
-      *   @param r            Random start vector 
-      *   @param size         Size of krylov-space
-      *   @param tag          Lanczos_tag with several options for the algorithm
-=======
 
       }
 
@@ -488,38 +306,23 @@ namespace viennacl
       *   @param A            The system matrix
       *   @param r            Random start vector
       *   @param size         Size of krylov-space
->>>>>>> upstream/1.5.1
       *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
       */
       template< typename MatrixT, typename VectorT >
       std::vector<
               typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
               >
-<<<<<<< HEAD
-      lanczos (MatrixT const& A, VectorT & r, int size, lanczos_tag const & tag)
-      {
-      
-        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
-        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
-
-        long i;
-=======
       lanczos (MatrixT const& A, VectorT & r, vcl_size_t size, lanczos_tag)
       {
 
         typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
         typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
 
->>>>>>> upstream/1.5.1
         ScalarType vcl_beta;
         ScalarType vcl_alpha;
         std::vector<CPU_ScalarType> alphas, betas;
         CPU_ScalarType norm;
-<<<<<<< HEAD
-        long n = r.size();
-=======
         vcl_size_t n = r.size();
->>>>>>> upstream/1.5.1
         VectorT u(n), t(n);
         boost::numeric::ublas::vector<CPU_ScalarType> s(r.size()), u_zero(n), q(n);
         boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
@@ -527,13 +330,8 @@ namespace viennacl
         u_zero = boost::numeric::ublas::zero_vector<CPU_ScalarType>(n);
         detail::copy_vec_to_vec(u_zero, u);
         norm = norm_2(r);
-<<<<<<< HEAD
-        
-        for(i = 0;i < size; i++)
-=======
 
         for(vcl_size_t i = 0;i < size; i++)
->>>>>>> upstream/1.5.1
         {
           r /= norm;
           vcl_beta = norm;
@@ -558,79 +356,44 @@ namespace viennacl
         return bisect(alphas, betas);
       }
 
-<<<<<<< HEAD
-      /** 
-      *   @brief Implementation of the Lanczos FRO algorithm
-      *   
-      *   @param A            The system matrix 
-      *   @param r            Random start vector 
-      *   @param size         Size of krylov-space
-      *   @param tag          Lanczos_tag with several options for the algorithm
-=======
       /**
       *   @brief Implementation of the Lanczos FRO algorithm
       *
       *   @param A            The system matrix
       *   @param r            Random start vector
       *   @param size         Size of krylov-space
->>>>>>> upstream/1.5.1
       *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
       */
       template< typename MatrixT, typename VectorT >
       std::vector<
               typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
               >
-<<<<<<< HEAD
-      lanczosFRO (MatrixT const& A, VectorT & r, int size, lanczos_tag const & tag)
-      {
-        
-        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
-        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
-        
-=======
       lanczosFRO (MatrixT const& A, VectorT & r, vcl_size_t size, lanczos_tag)
       {
 
         typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
         typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
 
->>>>>>> upstream/1.5.1
           CPU_ScalarType temp;
           CPU_ScalarType norm;
           ScalarType vcl_beta;
           ScalarType vcl_alpha;
           std::vector<CPU_ScalarType> alphas, betas;
-<<<<<<< HEAD
-          long n = r.size();
-=======
           vcl_size_t n = r.size();
->>>>>>> upstream/1.5.1
           VectorT u(n), t(n);
           ScalarType inner_rt;
           boost::numeric::ublas::vector<CPU_ScalarType> u_zero(n), s(r.size()), q(n);
           boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
-<<<<<<< HEAD
-          
-=======
 
->>>>>>> upstream/1.5.1
           long reorths = 0;
           norm = norm_2(r);
 
 
-<<<<<<< HEAD
-          for(long i = 0; i < size; i++)
-          {
-            r /= norm;
-
-            for(long j = 0; j < i; j++)
-=======
           for(vcl_size_t i = 0; i < size; i++)
           {
             r /= norm;
 
             for(vcl_size_t j = 0; j < i; j++)
->>>>>>> upstream/1.5.1
             {
               q = boost::numeric::ublas::column(Q, j);
               detail::copy_vec_to_vec(q, t);
@@ -654,17 +417,6 @@ namespace viennacl
             alphas.push_back(vcl_alpha);
             betas.push_back(vcl_beta);
           }
-<<<<<<< HEAD
-          
-          return bisect(alphas, betas);
-      }
-
-    } // end namespace detail    
-
-    /** 
-    *   @brief Implementation of the calculation of eigenvalues using lanczos
-    *   
-=======
 
           return bisect(alphas, betas);
       }
@@ -674,7 +426,6 @@ namespace viennacl
     /**
     *   @brief Implementation of the calculation of eigenvalues using lanczos
     *
->>>>>>> upstream/1.5.1
     *   @param matrix        The system matrix
     *   @param tag           Tag with several options for the lanczos algorithm
     *   @return              Returns the n largest eigenvalues (n defined in the lanczos_tag)
@@ -686,31 +437,6 @@ namespace viennacl
       typedef typename viennacl::result_of::value_type<MatrixT>::type           ScalarType;
       typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
       typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type    VectorT;
-<<<<<<< HEAD
-    
-      boost::mt11213b mt;
-      boost::normal_distribution<double> N(0, 1);
-      boost::bernoulli_distribution<double> B(0.5);
-      boost::triangle_distribution<double> T(-1, 0, 1);
-
-      boost::variate_generator<boost::mt11213b&, boost::normal_distribution<double> >     get_N(mt, N);
-      boost::variate_generator<boost::mt11213b&, boost::bernoulli_distribution<double> >  get_B(mt, B);
-      boost::variate_generator<boost::mt11213b&, boost::triangle_distribution<double> >   get_T(mt, T);
-      
-      std::vector<CPU_ScalarType> eigenvalues;
-      std::size_t matrix_size = matrix.size1();
-      VectorT r(matrix_size);
-      std::vector<CPU_ScalarType> s(matrix_size);
-      
-      for(std::size_t i=0; i<s.size(); ++i)
-        s[i] = 3.0 * get_B() + get_T() - 1.5; 
-
-      detail::copy_vec_to_vec(s,r);
-
-      std::size_t size_krylov = (matrix_size < tag.krylov_size()) ? matrix_size
-                                                                  : tag.krylov_size();
-      
-=======
 
       boost::mt11213b mt;
       boost::normal_distribution<CPU_ScalarType> N(0, 1);
@@ -734,7 +460,6 @@ namespace viennacl
       vcl_size_t size_krylov = (matrix_size < tag.krylov_size()) ? matrix_size
                                                                   : tag.krylov_size();
 
->>>>>>> upstream/1.5.1
       switch(tag.method())
       {
         case lanczos_tag::partial_reorthogonalization:
@@ -745,30 +470,11 @@ namespace viennacl
           break;
         case lanczos_tag::no_reorthogonalization:
           eigenvalues = detail::lanczos(matrix, r, size_krylov, tag);
-<<<<<<< HEAD
-          break;                
-=======
           break;
->>>>>>> upstream/1.5.1
       }
 
       std::vector<CPU_ScalarType> largest_eigenvalues;
 
-<<<<<<< HEAD
-      for(std::size_t i = 1; i<=tag.num_eigenvalues(); i++)
-        largest_eigenvalues.push_back(eigenvalues[size_krylov-i]);
-    
-    
-      return largest_eigenvalues;
-    }
-    
-    
-
-    
-  } // end namespace linalg
-} // end namespace viennacl
-#endif
-=======
       for(vcl_size_t i = 1; i<=tag.num_eigenvalues(); i++)
         largest_eigenvalues.push_back(eigenvalues[size_krylov-i]);
 
@@ -782,4 +488,3 @@ namespace viennacl
   } // end namespace linalg
 } // end namespace viennacl
 #endif
->>>>>>> upstream/1.5.1
diff --git a/viennacl/linalg/matrix_operations.hpp b/viennacl/linalg/matrix_operations.hpp
index 5607ace..5ca490e 100644
--- a/viennacl/linalg/matrix_operations.hpp
+++ b/viennacl/linalg/matrix_operations.hpp
@@ -34,23 +34,6 @@
 #include "viennacl/traits/start.hpp"
 #include "viennacl/traits/handle.hpp"
 #include "viennacl/traits/stride.hpp"
-<<<<<<< HEAD
-#include "viennacl/tools/matrix_kernel_class_deducer.hpp"
-#include "viennacl/tools/matrix_prod_kernel_class_deducer.hpp"
-#include "viennacl/linalg/kernels/vector_kernels.h"
-#include "viennacl/linalg/kernels/matrix_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_col_kernels.h"
-
-#include "viennacl/linalg/kernels/matrix_prod_col_col_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_col_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_row_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_row_row_kernels.h"
-
-#include "viennacl/linalg/kernels/matrix_prod_row_col_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_col_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_row_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_row_row_kernels.h"
-=======
 #include "viennacl/vector.hpp"
 #include "viennacl/linalg/host_based/matrix_operations.hpp"
 
@@ -61,108 +44,19 @@
 #ifdef VIENNACL_WITH_CUDA
   #include "viennacl/linalg/cuda/matrix_operations.hpp"
 #endif
->>>>>>> upstream/1.5.1
 
 namespace viennacl
 {
   namespace linalg
   {
-<<<<<<< HEAD
-    
-    /** @brief Assign a matrix (-range/-slice) to another matrix (-range/slice).
-    *
-    * Computes mat1 = mat2.
-    * 
-    * @param mat1  The destination matrix
-    * @param mat2  The source matrix
-    */
-    template <typename M1, typename M2>
-    typename viennacl::enable_if< viennacl::is_matrix<M1>::value
-                                  && viennacl::is_matrix<M2>::value
-                                >::type
-    assign(M1       & mat1,
-           M2 const & mat2)
-    {
-      typedef typename viennacl::result_of::cpu_value_type<M1>::type        value_type;
-      
-      assert( (viennacl::traits::size1(mat1) == viennacl::traits::size1(mat2))
-             && (viennacl::traits::size2(mat1) == viennacl::traits::size2(mat2))
-             && "Incompatible matrix sizes in assign()!");
-      
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< M1 >::ResultType    KernelClass;
-      
-      
-      std::size_t block_size = 16;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "assign");
-      k.global_work_size(0, block_size*block_size);
-      k.global_work_size(1, block_size*block_size);
-      k.local_work_size(0, block_size);
-      k.local_work_size(1, block_size);
-
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(mat1), 
-                                        cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)), 
-                                        cl_uint(viennacl::traits::stride1(mat1)),             cl_uint(viennacl::traits::stride2(mat1)),
-                                        cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
-                                        cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
-                                 viennacl::traits::handle(mat2), 
-                                        cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)), 
-                                        cl_uint(viennacl::traits::stride1(mat2)),             cl_uint(viennacl::traits::stride2(mat2)),
-                                        cl_uint(viennacl::traits::size1(mat2)),            cl_uint(viennacl::traits::size2(mat2)),
-                                        cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2))
-                                )
-                              );
-    }
-    
-    
-    //
-    ///////////////////////////////////// addition and subtraction///////////////////////////////////////////////
-    //
-    
-    namespace detail
-=======
 
     template <typename NumericT, typename F,
               typename ScalarType1>
     void am(matrix_base<NumericT, F> & mat1,
             matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
->>>>>>> upstream/1.5.1
     {
       switch (viennacl::traits::handle(mat1).get_active_handle_id())
       {
-<<<<<<< HEAD
-        assert(result.size1() == mat1.size1());
-        assert(result.size2() == mat1.size2());
-        assert(result.size1() == mat2.size1());
-        assert(result.size2() == mat2.size2());
-
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
-        
-        std::size_t block_size = 16;
-        
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
-        k.global_work_size(0, block_size*block_size);
-        k.global_work_size(1, block_size*block_size);
-        k.local_work_size(0, block_size);
-        k.local_work_size(1, block_size);
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(mat1), 
-                                        cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)), 
-                                        cl_uint(viennacl::traits::stride1(mat1)),             cl_uint(viennacl::traits::stride2(mat1)),
-                                        cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
-                                        cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
-                                viennacl::traits::handle(mat2), 
-                                        cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)), 
-                                        cl_uint(viennacl::traits::stride1(mat2)),             cl_uint(viennacl::traits::stride2(mat2)),
-                                        cl_uint(viennacl::traits::size1(mat2)),            cl_uint(viennacl::traits::size2(mat2)),
-                                        cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2)),
-                                viennacl::traits::handle(result), 
-                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
-                                        cl_uint(viennacl::traits::stride1(result)),           cl_uint(viennacl::traits::stride2(result)),
-                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
-                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
-                                )
-                              );        
-=======
         case viennacl::MAIN_MEMORY:
           viennacl::linalg::host_based::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
           break;
@@ -180,7 +74,6 @@ namespace viennacl
           throw memory_exception("not initialised!");
         default:
           throw memory_exception("not implemented");
->>>>>>> upstream/1.5.1
       }
     }
 
@@ -193,33 +86,6 @@ namespace viennacl
     {
       switch (viennacl::traits::handle(mat1).get_active_handle_id())
       {
-<<<<<<< HEAD
-        assert(viennacl::traits::size1(result) == viennacl::traits::size1(mat2));
-        assert(viennacl::traits::size2(result) == viennacl::traits::size2(mat2));
-
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
-        
-        std::size_t block_size = 16;
-        
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
-        k.global_work_size(0, block_size*block_size);
-        k.global_work_size(1, block_size*block_size);
-        k.local_work_size(0, block_size);
-        k.local_work_size(1, block_size);
-        
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(result),
-                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
-                                        cl_uint(viennacl::traits::stride1(result)),           cl_uint(viennacl::traits::stride2(result)),
-                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
-                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
-                                viennacl::traits::handle(mat2), 
-                                        cl_uint(viennacl::traits::start1(mat2)),            cl_uint(viennacl::traits::start2(mat2)), 
-                                        cl_uint(viennacl::traits::stride1(mat2)),              cl_uint(viennacl::traits::stride2(mat2)),
-                                        cl_uint(viennacl::traits::size1(mat2)),             cl_uint(viennacl::traits::size2(mat2)),
-                                        cl_uint(viennacl::traits::internal_size1(mat2)),    cl_uint(viennacl::traits::internal_size2(mat2))
-                                )
-                              );
-=======
         case viennacl::MAIN_MEMORY:
           viennacl::linalg::host_based::ambm(mat1,
                                              mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
@@ -243,7 +109,6 @@ namespace viennacl
           throw memory_exception("not initialised!");
         default:
           throw memory_exception("not implemented");
->>>>>>> upstream/1.5.1
       }
     }
 
@@ -341,26 +206,6 @@ namespace viennacl
     {
       switch (viennacl::traits::handle(v).get_active_handle_id())
       {
-<<<<<<< HEAD
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
-        
-        std::size_t block_size = 16;
-          
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
-        
-        k.global_work_size(0, block_size*block_size);
-        k.global_work_size(1, block_size*block_size);
-        k.local_work_size(0, block_size);
-        k.local_work_size(1, block_size);
-        
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(result),
-                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
-                                        cl_uint(viennacl::traits::stride1(result)),           cl_uint(viennacl::traits::stride2(result)),
-                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
-                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
-                                val)
-                              );
-=======
         case viennacl::MAIN_MEMORY:
           viennacl::linalg::host_based::matrix_diag_from_vector(v, k, A);
           break;
@@ -378,7 +223,6 @@ namespace viennacl
           throw memory_exception("not initialised!");
         default:
           throw memory_exception("not implemented");
->>>>>>> upstream/1.5.1
       }
     }
 
@@ -493,27 +337,6 @@ namespace viennacl
 
 
     // A * x
-<<<<<<< HEAD
-    /** @brief Returns a proxy class that represents matrix-vector multiplication
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::matrix<SCALARTYPE, F, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod > prod_impl(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat, 
-                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-    {
-      return viennacl::vector_expression<const viennacl::matrix<SCALARTYPE, F, ALIGNMENT>,
-                                         const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                         op_prod >(mat, vec);
-    }
-
-=======
->>>>>>> upstream/1.5.1
 
     /** @brief Carries out matrix-vector multiplication
     *
@@ -523,39 +346,6 @@ namespace viennacl
     * @param vec    The vector
     * @param result The result vector
     */
-<<<<<<< HEAD
-    template <typename MatrixType, typename VectorType1, typename VectorType2>
-    typename viennacl::enable_if<   viennacl::is_matrix<MatrixType>::value 
-                                  && viennacl::is_vector<VectorType1>::value 
-                                  && viennacl::is_vector<VectorType2>::value >::type
-    prod_impl(const MatrixType & mat, 
-              const VectorType1 & vec, 
-                    VectorType2 & result)
-    {
-      assert(mat.size2() == vec.size());
-      // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
-      assert(viennacl::traits::handle(vec).get() != viennacl::traits::handle(result).get() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
-      //result.resize(mat.size1());
-
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< MatrixType >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "vec_mul");
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(mat),
-                               cl_uint(viennacl::traits::start1(mat)),         cl_uint(viennacl::traits::start2(mat)), 
-                               cl_uint(viennacl::traits::stride1(mat)),           cl_uint(viennacl::traits::stride2(mat)),
-                               cl_uint(viennacl::traits::size1(mat)),          cl_uint(viennacl::traits::size2(mat)),
-                               cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),
-                               viennacl::traits::handle(vec),
-                                cl_uint(viennacl::traits::start(vec)),
-                                cl_uint(viennacl::traits::stride(vec)),
-                                cl_uint(viennacl::traits::size(vec)), 
-                               viennacl::traits::handle(result),
-                                cl_uint(viennacl::traits::start(result)),
-                                cl_uint(viennacl::traits::stride(result)),
-                                cl_uint(viennacl::traits::size(result))
-                             ) );
-    }
-=======
     template <typename NumericT, typename F>
     void prod_impl(const matrix_base<NumericT, F> & mat,
                    const vector_base<NumericT> & vec,
@@ -563,7 +353,6 @@ namespace viennacl
     {
       assert( (viennacl::traits::size1(mat) == viennacl::traits::size(result)) && bool("Size check failed at v1 = prod(A, v2): size1(A) != size(v1)"));
       assert( (viennacl::traits::size2(mat) == viennacl::traits::size(vec))    && bool("Size check failed at v1 = prod(A, v2): size2(A) != size(v2)"));
->>>>>>> upstream/1.5.1
 
       switch (viennacl::traits::handle(mat).get_active_handle_id())
       {
@@ -587,6 +376,7 @@ namespace viennacl
       }
     }
 
+
     // trans(A) * x
 
     /** @brief Carries out matrix-vector multiplication with a transposed matrix
@@ -602,37 +392,8 @@ namespace viennacl
                    const vector_base<NumericT> & vec,
                          vector_base<NumericT> & result)
     {
-<<<<<<< HEAD
-      assert(mat.size1() == vec.size());  //remember: mat is transposed!
-      // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
-      assert(vec.handle().get() != result.handle().get() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
-      result.resize(mat.size2());
-
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "trans_vec_mul");
-      
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(mat),
-                               cl_uint(viennacl::traits::start1(mat)),         cl_uint(viennacl::traits::start2(mat)), 
-                               cl_uint(viennacl::traits::stride1(mat)),           cl_uint(viennacl::traits::stride2(mat)),
-                               cl_uint(viennacl::traits::size1(mat)),          cl_uint(viennacl::traits::size2(mat)),
-                               cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),
-                               viennacl::traits::handle(vec),
-                                cl_uint(viennacl::traits::start(vec)),
-                                cl_uint(viennacl::traits::stride(vec)),
-                                cl_uint(viennacl::traits::size(vec)), 
-                               viennacl::traits::handle(result),
-                                cl_uint(viennacl::traits::start(result)),
-                                cl_uint(viennacl::traits::stride(result)),
-                                cl_uint(viennacl::traits::size(result))
-                             ) );
-    }
-
-
-=======
       assert( (viennacl::traits::size1(mat_trans.lhs()) == viennacl::traits::size(vec))    && bool("Size check failed at v1 = trans(A) * v2: size1(A) != size(v2)"));
       assert( (viennacl::traits::size2(mat_trans.lhs()) == viennacl::traits::size(result)) && bool("Size check failed at v1 = trans(A) * v2: size2(A) != size(v1)"));
->>>>>>> upstream/1.5.1
 
       switch (viennacl::traits::handle(mat_trans.lhs()).get_active_handle_id())
       {
@@ -660,155 +421,12 @@ namespace viennacl
     //
     /////////////////////////   matrix-matrix products /////////////////////////////////
     //
-<<<<<<< HEAD
-    
-    namespace detail
-    {
-      // C = A * B and possibly transposed variants
-      template <typename T1, typename T2, typename T3 >
-      void prod_slow_kernel(const T1 & A, 
-                            const T2 & B, 
-                            T3 & C,
-                            std::string kernel_name)
-      {
-        typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
-        
-        typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< T1, T2, T3 >::ResultType    KernelClass;
-        KernelClass::init();
-        
-        //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
-        
-        k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size1(C), 16));
-        k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size2(C), 16));
-        k.local_work_size(0, 16);
-        k.local_work_size(1, 16);
-        
-        cpu_value_type alpha(1);
-        cpu_value_type beta(0);
-        
-        viennacl::ocl::enqueue(k(alpha,
-                                 viennacl::traits::handle(A), 
-                                        cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)), 
-                                        cl_uint(viennacl::traits::stride1(A)),             cl_uint(viennacl::traits::stride2(A)),
-                                        cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
-                                        cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
-                                 viennacl::traits::handle(B), 
-                                        cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)), 
-                                        cl_uint(viennacl::traits::stride1(B)),             cl_uint(viennacl::traits::stride2(B)),
-                                        cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
-                                        cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
-                                 beta,
-                                 viennacl::traits::handle(C), 
-                                        cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)), 
-                                        cl_uint(viennacl::traits::stride1(C)),             cl_uint(viennacl::traits::stride2(C)),
-                                        cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
-                                        cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
-                                )
-                              );        
-      }
-      
-      // C = A * B, using fast kernel
-      template <typename T1, typename T2, typename T3 >
-      void prod_fast_kernel(const T1 & A, 
-                            const T2 & B, 
-                            T3 & C,
-                            std::string kernel_name)
-      {
-        typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
-        
-        typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< T1, T2, T3 >::ResultType    KernelClass;
-        KernelClass::init();
-        
-        //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
-        
-        k.global_work_size(0, viennacl::traits::size2(C) / 4); //column blocks
-        k.global_work_size(1, viennacl::traits::size1(C) / 4); //row blocks
-        k.local_work_size(0, 16);  //columns
-        k.local_work_size(1, 4);   //rows
-        
-        cpu_value_type alpha(1);
-        cpu_value_type beta(0);
-        
-        viennacl::ocl::enqueue(k(alpha,
-                                 viennacl::traits::handle(A), 
-                                        cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)), 
-                                        cl_uint(viennacl::traits::stride1(A)),             cl_uint(viennacl::traits::stride2(A)),
-                                        cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
-                                        cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
-                                 viennacl::traits::handle(B), 
-                                        cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)), 
-                                        cl_uint(viennacl::traits::stride1(B)),             cl_uint(viennacl::traits::stride2(B)),
-                                        cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
-                                        cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
-                                 beta,
-                                 viennacl::traits::handle(C), 
-                                        cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)), 
-                                        cl_uint(viennacl::traits::stride1(C)),             cl_uint(viennacl::traits::stride2(C)),
-                                        cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
-                                        cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
-                                )
-                              );        
-      }
-      
-      template <typename T1, typename T2, typename T3 >
-      void prod(const T1 & A, 
-                const T2 & B, 
-                T3 & C,
-                std::string fast_kernel_name,
-                std::string slow_kernel_name)
-      {
-        if (   (viennacl::traits::size1(A) < 64)
-            || (viennacl::traits::size2(A) < 64)
-            || (viennacl::traits::size1(B) < 64) )   //there is most likely not enough to compute, rendering kernel launch overhead considerable
-        {
-          prod_slow_kernel(A, B, C, slow_kernel_name);
-        }
-        else if (   (viennacl::traits::size1(A) % 64 == 0)
-                 && (viennacl::traits::size2(A) % 64 == 0)
-                 && (viennacl::traits::size1(B) % 64 == 0) )   // allows the use of the fast kernel only
-        {
-          prod_fast_kernel(A, B, C, fast_kernel_name);
-          //prod_slow_kernel(A, B, C, slow_kernel_name);
-        }
-        else //TODO: use four kernels
-        {
-          prod_slow_kernel(A, B, C, slow_kernel_name);
-        }
-        
-      }
-    }
-
-=======
->>>>>>> upstream/1.5.1
 
     /** @brief Carries out matrix-matrix multiplication
     *
     * Implementation of C = prod(A, B);
     *
     */
-<<<<<<< HEAD
-    template <typename T1, typename T2, typename T3 >
-    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                  && viennacl::is_matrix<T2>::value
-                                  && viennacl::is_matrix<T3>::value
-                                >::type
-    prod_impl(const T1 & A, 
-              const T2 & B, 
-                    T3 & C)
-    {
-      assert(viennacl::traits::size1(A) == viennacl::traits::size1(C));
-      assert(viennacl::traits::size2(A) == viennacl::traits::size1(B));
-      assert(viennacl::traits::size2(B) == viennacl::traits::size2(C));
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A).get() 
-            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B).get()
-            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-
-      
-      detail::prod(A, B, C, "prod16_AA", "prod_AA");
-=======
     template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
     void prod_impl(const matrix_base<NumericT, F1> & A,
                    const matrix_base<NumericT, F2> & B,
@@ -841,7 +459,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
 
@@ -851,30 +468,6 @@ namespace viennacl
     * Implementation of C = prod(trans(A), B);
     *
     */
-<<<<<<< HEAD
-    template <typename T1, typename T2, typename T3 >
-    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                  && viennacl::is_matrix<T2>::value
-                                  && viennacl::is_matrix<T3>::value
-                                >::type
-    prod_impl(const viennacl::matrix_expression< const T1,
-                                                 const T1,
-                                                 op_trans> & A, 
-              const T2 & B, 
-                    T3 & C)
-    {
-      //std::cout << "size2(A): " << viennacl::traits::size2(A.lhs()) << std::endl;
-      //std::cout << "size1(C): " << viennacl::traits::size1(C) << std::endl;
-      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C));
-      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B));
-      assert(viennacl::traits::size2(B) == viennacl::traits::size2(C));
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A.lhs()).get() 
-            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B).get()
-            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      detail::prod(A.lhs(), B, C, "prod16_TA", "prod_TA");
-=======
     template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
     void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>,
                                                       const matrix_base<NumericT, F1>,
@@ -908,7 +501,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
 
@@ -919,28 +511,6 @@ namespace viennacl
     * Implementation of C = prod(A, trans(B));
     *
     */
-<<<<<<< HEAD
-    template <typename T1, typename T2, typename T3 >
-    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                  && viennacl::is_matrix<T2>::value
-                                  && viennacl::is_matrix<T3>::value
-                                >::type
-    prod_impl(const T1 & A, 
-              const viennacl::matrix_expression< const T2,
-                                                 const T2,
-                                                 op_trans> & B,
-              T3 & C)
-    {
-      assert(viennacl::traits::size1(A) == viennacl::traits::size1(C));
-      assert(viennacl::traits::size2(A) == viennacl::traits::size2(B.lhs()));
-      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C));
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A).get() 
-            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B.lhs()).get()
-            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      detail::prod(A, B.lhs(), C, "prod16_AT", "prod_AT");
-=======
     template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
     void prod_impl(const matrix_base<NumericT, F1> & A,
                    const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
@@ -972,7 +542,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
 
@@ -982,30 +551,6 @@ namespace viennacl
     * Implementation of C = prod(trans(A), trans(B));
     *
     */
-<<<<<<< HEAD
-    template <typename T1, typename T2, typename T3 >
-    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                  && viennacl::is_matrix<T2>::value
-                                  && viennacl::is_matrix<T3>::value
-                                >::type
-    prod_impl(const viennacl::matrix_expression< const T1,
-                                                 const T1,
-                                                 op_trans> & A,
-              const viennacl::matrix_expression< const T2,
-                                                 const T2,
-                                                 op_trans> & B,
-              T3 & C)
-    {
-      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C));
-      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()));
-      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C));
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A.lhs()).get() 
-            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B.lhs()).get()
-            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      detail::prod(A.lhs(), B.lhs(), C, "prod16_TT", "prod_TT");
-=======
     template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
     void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & A,
                    const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
@@ -1037,7 +582,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
 
@@ -1244,105 +788,6 @@ namespace viennacl
   //
 
 
-<<<<<<< HEAD
-
-
-
-  //v = A * x
-  /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-  *
-  * @param proxy  An expression template proxy class.
-  */
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  template <typename F, unsigned int MAT_ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                        const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                        viennacl::op_prod> & proxy) 
-  {
-    // check for the special case x = A * x
-    if (proxy.rhs().handle().get() == this->handle().get())
-    {
-      viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this = result;
-    }
-    else
-    {
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-    }
-    return *this;
-  }
-
-
-  /** @brief Implementation of the operation v1 = A * v2, where A is a matrix and v1, v2 are vector ranges
-  *
-  * @param proxy  An expression template proxy class.
-  */
-  template <typename VectorType>
-  template <typename MatrixType>
-  typename viennacl::enable_if< viennacl::is_matrix<MatrixType>::value,
-                                viennacl::vector_range<VectorType> & >::type
-  viennacl::vector_range<VectorType>::operator=(const vector_expression< const MatrixType,
-                                                                         const viennacl::vector_range<VectorType>,
-                                                                         op_prod> & proxy)
-  {
-    typedef typename viennacl::result_of::cpu_value_type<VectorType>::type   cpu_value_type;
-    
-    
-    // check for the special case x = A * x
-    if (proxy.rhs().get().handle().get() == this->get().handle().get())
-    {
-      viennacl::vector<cpu_value_type> result(proxy.rhs().size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this = result;
-    }
-    else
-    {
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-    }
-    return *this;
-  }
-
-
-  /** @brief Implementation of the operation v1 = A * v2, where A is a matrix and v1, v2 are vector slices
-  *
-  * @param proxy  An expression template proxy class.
-  */
-  template <typename VectorType>
-  template <typename MatrixType>
-  typename viennacl::enable_if< viennacl::is_matrix<MatrixType>::value,
-                                viennacl::vector_slice<VectorType> & >::type
-  viennacl::vector_slice<VectorType>::operator=(const vector_expression< const MatrixType,
-                                                                         const viennacl::vector_slice<VectorType>,
-                                                                         op_prod> & proxy)
-  {
-    typedef typename viennacl::result_of::cpu_value_type<VectorType>::type   cpu_value_type;
-    
-    
-    // check for the special case x = A * x
-    if (proxy.rhs().get().handle().get() == this->get().handle().get())
-    {
-      viennacl::vector<cpu_value_type> result(proxy.rhs().size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this = result;
-    }
-    else
-    {
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-    }
-    return *this;
-  }
-
-
-
-
-
-
-
-
-=======
->>>>>>> upstream/1.5.1
   //v += A * x
   /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
   *
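For context on the matrix_operations.hpp hunks above: both the matrix-matrix backends (including C = prod(trans(A), trans(B))) and the matrix-vector assignment operators are reached through viennacl::linalg::prod() at the API level, and, as the assertions in the removed code spell out, aliasing products such as x = prod(A, x) require an explicit temporary. A minimal usage sketch (names and sizes are illustrative, not part of the patch):

    #include "viennacl/matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/prod.hpp"

    void prod_usage_sketch()
    {
      viennacl::matrix<float> A(64, 64), B(64, 64), C(64, 64);
      viennacl::vector<float> x(64), y(64), tmp(64);

      y = viennacl::linalg::prod(A, x);                 // y = A * x
      C = viennacl::linalg::prod(trans(A), trans(B));   // C = A^T * B^T

      // x = prod(A, x) aliases source and destination; use a temporary instead:
      tmp = viennacl::linalg::prod(A, x);
      x = tmp;
    }
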
diff --git a/viennacl/linalg/nmf.hpp b/viennacl/linalg/nmf.hpp
index d833728..e47712d 100644
--- a/viennacl/linalg/nmf.hpp
+++ b/viennacl/linalg/nmf.hpp
@@ -2,40 +2,25 @@
 #define VIENNACL_LINALG_NMF_HPP
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file viennacl/linalg/nmf.hpp
-<<<<<<< HEAD
-    @brief Provides a nonnegative matrix factorization implementation.  Experimental in 1.3.x.
-    
-=======
     @brief Provides a nonnegative matrix factorization implementation.  Experimental.
 
->>>>>>> upstream/1.5.1
     Contributed by Volodymyr Kysenko.
 */
 
@@ -44,110 +29,13 @@
 #include "viennacl/matrix.hpp"
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/linalg/norm_2.hpp"
-<<<<<<< HEAD
-#include "viennacl/linalg/kernels/nmf_kernels.h"
-=======
 #include "viennacl/linalg/norm_frobenius.hpp"
 #include "viennacl/linalg/opencl/kernels/nmf.hpp"
->>>>>>> upstream/1.5.1
 
 namespace viennacl
 {
   namespace linalg
   {
-<<<<<<< HEAD
-    //const std::string NMF_PROGRAM_NAME = "elem_wise_ops";
-    const std::string NMF_MUL_DIV_KERNEL = "el_wise_mul_div";
-    const std::string NMF_SUB_KERNEL = "sub_wise";
-
-
-    template <typename ScalarType>
-    void nmf(viennacl::matrix<ScalarType> const & v,
-             viennacl::matrix<ScalarType> & w,
-             viennacl::matrix<ScalarType> & h,
-             std::size_t k,
-             ScalarType eps = 0.000001,
-             std::size_t max_iter = 10000,
-             std::size_t check_diff_every_step = 100)
-    {
-      viennacl::linalg::kernels::nmf<ScalarType, 1>::init();
-      
-      w.resize(v.size1(), k);
-      h.resize(k, v.size2());
-
-      std::vector<ScalarType> stl_w(w.internal_size1() * w.internal_size2());
-      std::vector<ScalarType> stl_h(h.internal_size1() * h.internal_size2());
-
-      for (std::size_t j = 0; j < stl_w.size(); j++)
-          stl_w[j] = static_cast<ScalarType>(rand()) / RAND_MAX;
-
-      for (std::size_t j = 0; j < stl_h.size(); j++)
-          stl_h[j] = static_cast<ScalarType>(rand()) / RAND_MAX;
-
-      viennacl::matrix<ScalarType> wn(v.size1(), k);
-      viennacl::matrix<ScalarType> wd(v.size1(), k);
-      viennacl::matrix<ScalarType> wtmp(v.size1(), v.size2());
-
-      viennacl::matrix<ScalarType> hn(k, v.size2());
-      viennacl::matrix<ScalarType> hd(k, v.size2());
-      viennacl::matrix<ScalarType> htmp(k, k);
-
-      viennacl::matrix<ScalarType> appr(v.size1(), v.size2());
-      viennacl::vector<ScalarType> diff(v.size1() * v.size2());
-
-      viennacl::fast_copy(&stl_w[0], &stl_w[0] + stl_w.size(), w);
-      viennacl::fast_copy(&stl_h[0], &stl_h[0] + stl_h.size(), h);
-
-      ScalarType last_diff = 0.0f;
-
-
-      
-      for (std::size_t i = 0; i < max_iter; i++)
-      {
-        {
-          hn = viennacl::linalg::prod(trans(w), v);
-          htmp = viennacl::linalg::prod(trans(w), w);
-          hd = viennacl::linalg::prod(htmp, h);
-
-          viennacl::ocl::kernel & mul_div_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(), 
-                                                                             NMF_MUL_DIV_KERNEL);
-          viennacl::ocl::enqueue(mul_div_kernel(h, hn, hd, cl_uint(stl_h.size())));
-        }
-        {
-          wn = viennacl::linalg::prod(v, trans(h));
-          wtmp = viennacl::linalg::prod(w, h);
-          wd = viennacl::linalg::prod(wtmp, trans(h));
-
-          viennacl::ocl::kernel & mul_div_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(), 
-                                                                             NMF_MUL_DIV_KERNEL);
-          
-          viennacl::ocl::enqueue(mul_div_kernel(w, wn, wd, cl_uint(stl_w.size())));
-        }
-
-        if (i % check_diff_every_step == 0)
-        {
-          appr = viennacl::linalg::prod(w, h);
-
-         viennacl::ocl::kernel & sub_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(), 
-                                                                        NMF_SUB_KERNEL);
-          //this is a cheat. i.e save difference of two matrix into vector to get norm_2
-          viennacl::ocl::enqueue(sub_kernel(appr, v, diff, cl_uint(v.size1() * v.size2())));
-          ScalarType diff_val = viennacl::linalg::norm_2(diff);
-
-          if((diff_val < eps) || (fabs(diff_val - last_diff) < eps))
-          {
-              //std::cout << "Breaked at diff - " << diff_val << "\n";
-              break;
-          }
-
-          last_diff = diff_val;
-
-          //printf("Iteration #%lu - %.5f \n", i, diff_val);
-        }
-      }
-      
-      
-=======
     /** @brief Configuration class for the nonnegative-matrix-factorization algorithm. Specify tolerances, maximum iteration counts, etc., here. */
     class nmf_config
     {
@@ -305,13 +193,8 @@ namespace viennacl
       }
 
 
->>>>>>> upstream/1.5.1
     }
   }
 }
 
-<<<<<<< HEAD
-#endif
-=======
 #endif
->>>>>>> upstream/1.5.1
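The hunks above replace the 1.3.x-era nmf() free function (explicit eps/max_iter arguments and hand-rolled OpenCL kernel launches) with the 1.5.1 version built around the nmf_config class. A hedged usage sketch, assuming the 1.5.1 entry point viennacl::linalg::nmf(V, W, H, conf) with caller-initialised factors (that signature is not visible in this hunk):

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/nmf.hpp"

    void nmf_usage_sketch()
    {
      vcl_size_t m = 100, n = 80, k = 8;
      viennacl::matrix<float> V(m, n);   // nonnegative input, filled by the caller
      viennacl::matrix<float> W(m, k);   // factor, initialised to nonnegative values
      viennacl::matrix<float> H(k, n);   // factor, initialised to nonnegative values

      viennacl::linalg::nmf_config conf;      // tolerances and iteration limits
      viennacl::linalg::nmf(V, W, H, conf);   // V is approximated by W * H
    }
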
diff --git a/viennacl/linalg/norm_1.hpp b/viennacl/linalg/norm_1.hpp
index 6487a33..42c6e02 100644
--- a/viennacl/linalg/norm_1.hpp
+++ b/viennacl/linalg/norm_1.hpp
@@ -82,29 +82,6 @@ namespace viennacl
                                           viennacl::op_norm_1 >(vector, vector);
     }
 
-<<<<<<< HEAD
-    template< typename VectorType >
-    viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
-                                 const viennacl::vector_range<VectorType>,
-                                 viennacl::op_norm_1 >
-    norm_1(viennacl::vector_range<VectorType> const & vector)
-    {
-      return viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
-                                          const viennacl::vector_range<VectorType>,
-                                          viennacl::op_norm_1 >(vector, vector);
-    }
-
-    template< typename VectorType >
-    viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
-                                 const viennacl::vector_slice<VectorType>,
-                                 viennacl::op_norm_1 >
-    norm_1(viennacl::vector_slice<VectorType> const & vector)
-    {
-      return viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
-                                          const viennacl::vector_slice<VectorType>,
-                                          viennacl::op_norm_1 >(vector, vector);
-    }
-=======
     // with vector expression:
     template <typename LHS, typename RHS, typename OP>
     viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
@@ -124,7 +101,6 @@ namespace viennacl
     {
       return scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_1>(A, A);
     }*/
->>>>>>> upstream/1.5.1
 
   } // end namespace linalg
 } // end namespace viennacl
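The vector_range/vector_slice overloads of norm_1 dropped above are covered by the generic overload in 1.5.x, where ranges and slices share the common vector_base interface. A small sketch with illustrative sizes:

    #include "viennacl/vector.hpp"
    #include "viennacl/vector_proxy.hpp"
    #include "viennacl/linalg/norm_1.hpp"

    void norm_1_sketch()
    {
      viennacl::vector<float> v(100);
      float full = viennacl::linalg::norm_1(v);
      float head = viennacl::linalg::norm_1(viennacl::project(v, viennacl::range(0, 50)));
      (void)full; (void)head;
    }
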
diff --git a/viennacl/linalg/norm_2.hpp b/viennacl/linalg/norm_2.hpp
index 20d21c3..e716ce3 100644
--- a/viennacl/linalg/norm_2.hpp
+++ b/viennacl/linalg/norm_2.hpp
@@ -107,30 +107,6 @@ namespace viennacl
                                           viennacl::op_norm_2 >(v, v);
     }
 
-<<<<<<< HEAD
-
-    template< typename VectorType >
-    viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
-                                 const viennacl::vector_range<VectorType>,
-                                 viennacl::op_norm_2 >
-    norm_2(viennacl::vector_range<VectorType> const & vector)
-    {
-      return viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
-                                          const viennacl::vector_range<VectorType>,
-                                          viennacl::op_norm_2 >(vector, vector);
-    }
-
-    template< typename VectorType >
-    viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
-                                 const viennacl::vector_slice<VectorType>,
-                                 viennacl::op_norm_2 >
-    norm_2(viennacl::vector_slice<VectorType> const & vector)
-    {
-      return viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
-                                          const viennacl::vector_slice<VectorType>,
-                                          viennacl::op_norm_2 >(vector, vector);
-    }
-=======
     // with vector expression:
     template <typename LHS, typename RHS, typename OP>
     viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
@@ -143,7 +119,6 @@ namespace viennacl
                                           viennacl::op_norm_2>(vector, vector);
     }
 
->>>>>>> upstream/1.5.1
 
   } // end namespace linalg
 } // end namespace viennacl
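norm_2 follows the same pattern; a strided slice goes through the generic overload as well. Sketch (illustrative):

    #include "viennacl/vector.hpp"
    #include "viennacl/vector_proxy.hpp"
    #include "viennacl/linalg/norm_2.hpp"

    void norm_2_sketch()
    {
      viennacl::vector<float> v(100);
      viennacl::slice s(0, 2, 50);   // every second entry
      float nrm = viennacl::linalg::norm_2(viennacl::project(v, s));
      (void)nrm;
    }
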
diff --git a/viennacl/linalg/norm_inf.hpp b/viennacl/linalg/norm_inf.hpp
index ab5681c..b8d15eb 100644
--- a/viennacl/linalg/norm_inf.hpp
+++ b/viennacl/linalg/norm_inf.hpp
@@ -85,29 +85,6 @@ namespace viennacl
                                           viennacl::op_norm_inf >(v1, v1);
     }
 
-<<<<<<< HEAD
-    template< typename VectorType >
-    viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
-                                 const viennacl::vector_range<VectorType>,
-                                 viennacl::op_norm_inf >
-    norm_inf(viennacl::vector_range<VectorType> const & vector)
-    {
-      return viennacl::scalar_expression< const viennacl::vector_range<VectorType>, 
-                                          const viennacl::vector_range<VectorType>,
-                                          viennacl::op_norm_inf >(vector, vector);
-    }
-
-    template< typename VectorType >
-    viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
-                                 const viennacl::vector_slice<VectorType>,
-                                 viennacl::op_norm_inf >
-    norm_inf(viennacl::vector_slice<VectorType> const & vector)
-    {
-      return viennacl::scalar_expression< const viennacl::vector_slice<VectorType>, 
-                                          const viennacl::vector_slice<VectorType>,
-                                          viennacl::op_norm_inf >(vector, vector);
-    }
-=======
     // with vector expression:
     template <typename LHS, typename RHS, typename OP>
     viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
@@ -129,7 +106,6 @@ namespace viennacl
       return scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_inf>(A, A);
     }*/
 
->>>>>>> upstream/1.5.1
 
   } // end namespace linalg
 } // end namespace viennacl
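For norm_inf the upstream side additionally keeps an overload taking a vector_expression, intended to allow residual checks such as norm_inf(x - y) without a named temporary. The sketch below uses an explicit temporary to stay within the parts of the API visible in this patch:

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/norm_inf.hpp"

    void norm_inf_sketch()
    {
      viennacl::vector<float> x(100), y(100), d(100);
      d = x - y;                                    // residual vector
      float resid = viennacl::linalg::norm_inf(d);  // largest absolute entry
      (void)resid;
    }
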
diff --git a/viennacl/linalg/power_iter.hpp b/viennacl/linalg/power_iter.hpp
index bfe6026..75ee20d 100644
--- a/viennacl/linalg/power_iter.hpp
+++ b/viennacl/linalg/power_iter.hpp
@@ -2,41 +2,25 @@
 #define VIENNACL_LINALG_POWER_ITER_HPP_
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-<<<<<<< HEAD
-/** @file power_iter.hpp
-    @brief Defines a tag for the configuration of the power iteration method.
-    
-=======
 /** @file viennacl/linalg/power_iter.hpp
     @brief Defines a tag for the configuration of the power iteration method.
 
->>>>>>> upstream/1.5.1
     Contributed by Astrid Rupp.
 */
 
@@ -48,15 +32,6 @@
 
 namespace viennacl
 {
-<<<<<<< HEAD
-  namespace linalg 
-  {
-    /** @brief A tag for the power iteration algorithm. */
-    class power_iter_tag 
-    {
-      public:
-        
-=======
   namespace linalg
   {
     /** @brief A tag for the power iteration algorithm. */
@@ -64,40 +39,18 @@ namespace viennacl
     {
       public:
 
->>>>>>> upstream/1.5.1
         /** @brief The constructor
         *
         * @param tfac      If the eigenvalue does not change more than this termination factor, the algorithm stops
         * @param max_iters Maximum number of iterations for the power iteration
         */
-<<<<<<< HEAD
-        power_iter_tag(double tfac = 1e-8, std::size_t max_iters = 50000) : termination_factor_(tfac), max_iterations_(max_iters) {}
-=======
         power_iter_tag(double tfac = 1e-8, vcl_size_t max_iters = 50000) : termination_factor_(tfac), max_iterations_(max_iters) {}
->>>>>>> upstream/1.5.1
 
         /** @brief Sets the factor for termination */
         void factor(double fct){ termination_factor_ = fct; }
 
           /** @brief Returns the factor for termination */
         double factor() const { return termination_factor_; }
-<<<<<<< HEAD
-        
-        std::size_t max_iterations() const { return max_iterations_; }
-        void max_iterations(std::size_t new_max) { max_iterations_ = new_max; }
-
-      private: 
-        double termination_factor_;
-        std::size_t max_iterations_;
-
-    };
-  
-   /** 
-    *   @brief Implementation of the calculation of eigenvalues using poweriteration
-    *   
-    *   @param matrix        The system matrix
-    *   @param tag           Tag with termination factor 
-=======
 
         vcl_size_t max_iterations() const { return max_iterations_; }
         void max_iterations(vcl_size_t new_max) { max_iterations_ = new_max; }
@@ -113,7 +66,6 @@ namespace viennacl
     *
     *   @param matrix        The system matrix
     *   @param tag           Tag with termination factor
->>>>>>> upstream/1.5.1
     *   @return              Returns the largest eigenvalue computed by the power iteration method
     */
     template< typename MatrixT >
@@ -124,15 +76,6 @@ namespace viennacl
       typedef typename viennacl::result_of::value_type<MatrixT>::type           ScalarType;
       typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
       typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type    VectorT;
-<<<<<<< HEAD
-    
-      CPU_ScalarType eigenvalue;
-      long matrix_size = matrix.size1();
-      VectorT r(matrix_size);
-      std::vector<CPU_ScalarType> s(matrix_size);
-      
-      for(std::size_t i=0; i<s.size(); ++i)
-=======
 
       CPU_ScalarType eigenvalue;
       vcl_size_t matrix_size = matrix.size1();
@@ -141,31 +84,17 @@ namespace viennacl
       std::vector<CPU_ScalarType> s(matrix_size);
 
       for(vcl_size_t i=0; i<s.size(); ++i)
->>>>>>> upstream/1.5.1
         s[i] = (i % 3) * CPU_ScalarType(0.1234) - CPU_ScalarType(0.5);   //'random' starting vector
 
       detail::copy_vec_to_vec(s,r);
 
       //std::cout << s << std::endl;
-<<<<<<< HEAD
-      
-=======
 
->>>>>>> upstream/1.5.1
       double epsilon = tag.factor();
       CPU_ScalarType norm = norm_2(r);
       CPU_ScalarType norm_prev = 0;
       long numiter = 0;
 
-<<<<<<< HEAD
-      for (std::size_t i=0; i<tag.max_iterations(); ++i)
-      {
-        if (std::abs<CPU_ScalarType>(norm - norm_prev) / std::abs<CPU_ScalarType>(norm) < epsilon)
-          break; 
-           
-        r /= norm;
-        r = viennacl::linalg::prod(matrix, r);
-=======
       for (vcl_size_t i=0; i<tag.max_iterations(); ++i)
       {
         if (std::fabs(norm - norm_prev) / std::fabs(norm) < epsilon)
@@ -174,7 +103,6 @@ namespace viennacl
         r /= norm;
         r2 = viennacl::linalg::prod(matrix, r);  //using helper vector r2 for the computation of r <- A * r in order to avoid the repeated creation of temporaries
         r = r2;
->>>>>>> upstream/1.5.1
         norm_prev = norm;
         norm = norm_2(r);
         numiter++;
@@ -187,8 +115,4 @@ namespace viennacl
 
   } // end namespace linalg
 } // end namespace viennacl
-<<<<<<< HEAD
-#endif
-=======
 #endif
->>>>>>> upstream/1.5.1
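power_iter.hpp defines the power_iter_tag shown above together with the power-iteration routine itself. A hedged sketch, assuming the routine is exposed as viennacl::linalg::eig(A, tag); that function name is not part of this hunk:

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/power_iter.hpp"

    void power_iter_sketch()
    {
      viennacl::matrix<float> A(500, 500);                 // filled by the caller
      viennacl::linalg::power_iter_tag tag(1e-8, 50000);   // termination factor, max iterations
      double lambda_max = viennacl::linalg::eig(A, tag);   // largest eigenvalue
      (void)lambda_max;
    }
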
diff --git a/viennacl/linalg/prod.hpp b/viennacl/linalg/prod.hpp
index 9281018..702fc8f 100644
--- a/viennacl/linalg/prod.hpp
+++ b/viennacl/linalg/prod.hpp
@@ -148,32 +148,11 @@ namespace viennacl
                                           viennacl::op_mat_mat_prod >(A, B);
     }
 
-<<<<<<< HEAD
-    template< typename MatrixT1, typename MatrixT2 >
-    viennacl::matrix_expression< const MatrixT1, 
-                                 const viennacl::matrix_slice<MatrixT2>,
-                                 viennacl::op_prod >
-    prod(MatrixT1 const& A,
-         viennacl::matrix_slice<MatrixT2> const& B)
-    {
-      // std::cout << "viennacl .. " << std::endl;
-      return viennacl::matrix_expression< const MatrixT1, 
-                                          const viennacl::matrix_slice<MatrixT2>,
-                                          viennacl::op_prod >(A, B);
-    }
-
-
-    template< typename MatrixT1, typename MatrixT2 >
-    viennacl::matrix_expression< const MatrixT1, 
-                                 const viennacl::matrix_expression<const viennacl::matrix_range<MatrixT2>,
-                                                                   const viennacl::matrix_range<MatrixT2>,
-=======
     // right factor is transposed:
     template< typename NumericT, typename F1, typename F2>
     viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
                                  const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F2>,
                                                                    const viennacl::matrix_base<NumericT, F2>,
->>>>>>> upstream/1.5.1
                                                                    op_trans>,
                                  viennacl::op_mat_mat_prod >
     prod(viennacl::matrix_base<NumericT, F1> const & A,
@@ -189,25 +168,6 @@ namespace viennacl
                                           viennacl::op_mat_mat_prod >(A, B);
     }
 
-<<<<<<< HEAD
-    template< typename MatrixT1, typename MatrixT2 >
-    viennacl::matrix_expression< const MatrixT1, 
-                                 const viennacl::matrix_expression<const viennacl::matrix_slice<MatrixT2>,
-                                                                   const viennacl::matrix_slice<MatrixT2>,
-                                                                   op_trans>,
-                                 viennacl::op_prod >
-    prod(MatrixT1 const & A,
-         viennacl::matrix_expression<const viennacl::matrix_slice<MatrixT2>,
-                                     const viennacl::matrix_slice<MatrixT2>,
-                                     op_trans> const & B)
-    {
-      // std::cout << "viennacl .. " << std::endl;
-      return viennacl::matrix_expression< const MatrixT1, 
-                                          const viennacl::matrix_expression<const viennacl::matrix_slice<MatrixT2>,
-                                                                            const viennacl::matrix_slice<MatrixT2>,
-                                                                            op_trans>,
-                                          viennacl::op_prod >(A, B);
-=======
     // left factor transposed:
     template< typename NumericT, typename F1, typename F2>
     viennacl::matrix_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F1>,
@@ -226,7 +186,6 @@ namespace viennacl
                                                                             op_trans>,
                                           const viennacl::matrix_base<NumericT, F2>,
                                           viennacl::op_mat_mat_prod >(A, B);
->>>>>>> upstream/1.5.1
     }
 
 
@@ -263,13 +222,8 @@ namespace viennacl
     viennacl::vector_expression< const viennacl::matrix_base<NumericT, F>,
                                  const viennacl::vector_base<NumericT>,
                                  viennacl::op_prod >
-<<<<<<< HEAD
-    prod(MatrixT const& matrix,
-         viennacl::vector<NumericT, ALIGNMENT> const & vector)
-=======
     prod(viennacl::matrix_base<NumericT, F> const & matrix,
          viennacl::vector_base<NumericT> const & vector)
->>>>>>> upstream/1.5.1
     {
       // std::cout << "viennacl .. " << std::endl;
       return viennacl::vector_expression< const viennacl::matrix_base<NumericT, F>,
@@ -277,47 +231,12 @@ namespace viennacl
                                           viennacl::op_prod >(matrix, vector);
     }
 
-<<<<<<< HEAD
-    template< typename MatrixT, typename VectorType >
-    viennacl::vector_expression< const MatrixT, 
-                                 const viennacl::vector_range<VectorType>,
-                                 viennacl::op_prod >
-    prod(MatrixT const& matrix,
-         viennacl::vector_range<VectorType> const & vector)
-    {
-      // std::cout << "viennacl .. " << std::endl;
-      return viennacl::vector_expression< const MatrixT, 
-                                          const viennacl::vector_range<VectorType>,
-                                          viennacl::op_prod >(matrix, vector);
-    }
-
-    template< typename MatrixT, typename VectorType >
-    viennacl::vector_expression< const MatrixT, 
-                                 const viennacl::vector_slice<VectorType>,
-                                 viennacl::op_prod >
-    prod(MatrixT const& matrix,
-         viennacl::vector_slice<VectorType> const & vector)
-    {
-      // std::cout << "viennacl .. " << std::endl;
-      return viennacl::vector_expression< const MatrixT, 
-                                          const viennacl::vector_slice<VectorType>,
-                                          viennacl::op_prod >(matrix, vector);
-    }
-
-
-
-
-    template< typename MatrixT, typename NumericT, typename F, unsigned int ALIGNMENT >
-    viennacl::matrix_expression< const MatrixT, 
-                                 const viennacl::matrix<NumericT, F, ALIGNMENT>,
-=======
     // transposed matrix-vector product
     template< typename NumericT, typename F>
     viennacl::vector_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F>,
                                                                    const viennacl::matrix_base<NumericT, F>,
                                                                    op_trans>,
                                  const viennacl::vector_base<NumericT>,
->>>>>>> upstream/1.5.1
                                  viennacl::op_prod >
     prod(viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F>,
                                      const viennacl::matrix_base<NumericT, F>,
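The prod() overloads in this header are pure expression builders: they return matrix_expression/vector_expression objects, and the actual kernels only run when the expression is assigned to a target. Sketch with illustrative dimensions:

    #include "viennacl/matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/prod.hpp"

    void prod_expression_sketch()
    {
      viennacl::matrix<float> A(32, 64), B(32, 64), C(64, 64);
      viennacl::vector<float> x(32), y(64);

      C = viennacl::linalg::prod(trans(A), B);   // left factor transposed: C = A^T * B
      y = viennacl::linalg::prod(trans(A), x);   // transposed matrix-vector product
    }
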
diff --git a/viennacl/linalg/qr.hpp b/viennacl/linalg/qr.hpp
index 7935eaa..34b63ca 100644
--- a/viennacl/linalg/qr.hpp
+++ b/viennacl/linalg/qr.hpp
@@ -217,50 +217,14 @@ namespace viennacl
         project( A, range(j+1, A.size1()), range(j, j+1) ) = project(v, range(j+1, A.size1()), range(0, 1) );;
       }
 
-<<<<<<< HEAD
-      
-
-      /** @brief Implementation of inplace-QR factorization for a general Boost.uBLAS compatible matrix A 
-      * 
-=======
 
 
       /** @brief Implementation of inplace-QR factorization for a general Boost.uBLAS compatible matrix A
       *
->>>>>>> upstream/1.5.1
       * @param A            A dense compatible to Boost.uBLAS
       * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
       */
       template<typename MatrixType>
-<<<<<<< HEAD
-      std::vector<typename MatrixType::value_type> inplace_qr_ublas(MatrixType & A, std::size_t block_size = 32)
-      {
-        typedef typename MatrixType::value_type   ScalarType;
-        typedef boost::numeric::ublas::matrix_range<MatrixType>  MatrixRange;
-        
-        using boost::numeric::ublas::range;
-        using boost::numeric::ublas::project;
-        
-        std::vector<ScalarType> betas(A.size2());
-        //boost::numeric::ublas::vector<ScalarType> v(A.size1());
-        MatrixType v(A.size1(), 1);
-        MatrixType matrix_1x1(1,1);
-
-        MatrixType Y(A.size1(), block_size); Y.clear(); Y.resize(A.size1(), block_size);
-        MatrixType W(A.size1(), block_size); W.clear(); W.resize(A.size1(), block_size);
-          
-        //run over A in a block-wise manner:
-        for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
-        {
-          std::size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
-          
-          //determine Householder vectors:
-          for (std::size_t k = 0; k < effective_block_size; ++k)
-          {
-            betas[j+k] = detail::setup_householder_vector_ublas(A, v, matrix_1x1, j+k);
-            
-            for (std::size_t l = k; l < effective_block_size; ++l)
-=======
       std::vector<typename MatrixType::value_type> inplace_qr_ublas(MatrixType & A, vcl_size_t block_size = 32)
       {
         typedef typename MatrixType::value_type   ScalarType;
@@ -287,7 +251,6 @@ namespace viennacl
             betas[j+k] = detail::setup_householder_vector_ublas(A, v, matrix_1x1, j+k);
 
             for (vcl_size_t l = k; l < effective_block_size; ++l)
->>>>>>> upstream/1.5.1
               detail::householder_reflect_ublas(A, v, matrix_1x1, betas[j+k], j+k, j+l);
 
             detail::write_householder_to_A_ublas(A, v, j+k);
@@ -297,102 +260,6 @@ namespace viennacl
           // Setup Y:
           //
           Y.clear();  Y.resize(A.size1(), block_size);
-<<<<<<< HEAD
-          for (std::size_t k = 0; k < effective_block_size; ++k)
-          {
-            //write Householder to Y:
-            Y(j+k,k) = 1.0;
-            project(Y, range(j+k+1, A.size1()), range(k, k+1)) = project(A, range(j+k+1, A.size1()), range(j+k, j+k+1));
-          }
-          
-          //
-          // Setup W:
-          //
-          
-          //first vector:
-          W.clear();  W.resize(A.size1(), block_size);
-          W(j, 0) = -betas[j];
-          project(W, range(j+1, A.size1()), range(0, 1)) = -betas[j] * project(A, range(j+1, A.size1()), range(j, j+1));
-          
-          
-          //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
-          for (std::size_t k = 1; k < effective_block_size; ++k)
-          {
-            MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
-            MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
-            MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
-            MatrixRange z     = project(W, range(j, A.size1()), range(k, k+1));
-            
-            MatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
-            z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
-          }
-
-          //
-          //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
-          //
-          
-          if (A.size2() - j - effective_block_size > 0)
-          {
-            
-            MatrixRange A_part(A, range(j, A.size1()), range(j+effective_block_size, A.size2()));
-            MatrixRange W_part(W, range(j, A.size1()), range(0, effective_block_size));
-            MatrixType temp = boost::numeric::ublas::prod(trans(W_part), A_part);
-            
-            A_part += prod(project(Y, range(j, A.size1()), range(0, Y.size2())),
-                          temp);
-          }
-        }
-
-        return betas;
-      }
-
-
-      /** @brief Implementation of a OpenCL-only QR factorization for GPUs (or multi-core CPU). DEPRECATED! Use only if you're curious and interested in playing a bit with a GPU-only implementation.
-      * 
-      * Performance is rather poor at small matrix sizes.
-      * Prefer the use of the hybrid version, which is automatically chosen using the interface function inplace_qr()
-      * 
-      * @param A            A dense ViennaCL matrix to be factored
-      * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
-      */
-      template<typename MatrixType>
-      std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type > 
-      inplace_qr_viennacl(MatrixType & A, std::size_t block_size = 16)
-      {
-        typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
-        typedef viennacl::matrix_range<MatrixType>  MatrixRange;
-        
-        //using boost::numeric::ublas::range;
-        //using boost::numeric::ublas::project;
-        using viennacl::range;
-        using viennacl::project;
-        
-        std::vector<ScalarType> betas(A.size2());
-        //boost::numeric::ublas::vector<ScalarType> v(A.size1());
-        MatrixType v(A.size1(), 1);
-        MatrixType matrix_1x1(1,1);
-
-        MatrixType Y(A.size1(), block_size); Y.clear();
-        MatrixType W(A.size1(), block_size); W.clear();
-
-        MatrixType YT_prod_v(block_size, 1);
-        MatrixType z(A.size1(), 1);      
-        
-        //run over A in a block-wise manner:
-        for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
-        {
-          std::size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
-          
-          //determine Householder vectors:
-          for (std::size_t k = 0; k < effective_block_size; ++k)
-          {
-            betas[j+k] = detail::setup_householder_vector_viennacl(A, v, matrix_1x1, j+k);
-            for (std::size_t l = k; l < effective_block_size; ++l)
-              detail::householder_reflect_viennacl(A, v, matrix_1x1, betas[j+k], j+k, j+l);
-
-            detail::write_householder_to_A_viennacl(A, v, j+k);
-          }
-=======
           for (vcl_size_t k = 0; k < effective_block_size; ++k)
           {
             //write Householder to Y:
@@ -484,74 +351,28 @@ namespace viennacl
 
             detail::write_householder_to_A_viennacl(A, v, j+k);
           }
->>>>>>> upstream/1.5.1
 
           //
           // Setup Y:
           //
           Y.clear();
-<<<<<<< HEAD
-          for (std::size_t k = 0; k < effective_block_size; ++k)
-=======
           for (vcl_size_t k = 0; k < effective_block_size; ++k)
->>>>>>> upstream/1.5.1
           {
             //write Householder to Y:
             Y(j+k,k) = 1.0;
             project(Y, range(j+k+1, A.size1()), range(k, k+1)) = project(A, range(j+k+1, A.size1()), range(j+k, j+k+1));
           }
-<<<<<<< HEAD
-          
-          //
-          // Setup W:
-          //
-          
-=======
 
           //
           // Setup W:
           //
 
->>>>>>> upstream/1.5.1
           //first vector:
           W.clear();
           W(j, 0) = -betas[j];
           //project(W, range(j+1, A.size1()), range(0, 1)) = -betas[j] * project(A, range(j+1, A.size1()), range(j, j+1));
           project(W, range(j+1, A.size1()), range(0, 1)) = project(A, range(j+1, A.size1()), range(j, j+1));
           project(W, range(j+1, A.size1()), range(0, 1)) *= -betas[j];
-<<<<<<< HEAD
-          
-          
-          //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
-          for (std::size_t k = 1; k < effective_block_size; ++k)
-          {
-            MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
-            MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
-            MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
-            //MatrixRange z     = project(W, range(0, A.size1()), range(k, k+1));
-          
-            //std::cout << "should: " << k << std::endl;
-            project(YT_prod_v, range(0, k), range(0,1)) = prod(trans(Y_old), v_k);
-            project(z, range(j, A.size1()), range(0,1)) = prod(W_old, project(YT_prod_v, range(0, k), range(0,1)));
-            //project(W, range(0, A.size1()), range(k, k+1)) = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
-            project(W, range(j, A.size1()), range(k, k+1)) = project(z, range(j, A.size1()), range(0,1));
-            project(W, range(j, A.size1()), range(k, k+1)) += v_k;
-            project(W, range(j, A.size1()), range(k, k+1)) *= - betas[j+k];
-          }
-
-          //
-          //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
-          //
-          
-          if (A.size2() - j - effective_block_size > 0)
-          {
-            
-            MatrixRange A_part(A, range(j, A.size1()), range(j+effective_block_size, A.size2()));
-            MatrixRange W_part(W, range(j, A.size1()), range(0, effective_block_size));
-            MatrixType temp = prod(trans(W_part), A_part);
-            
-            A_part += prod(project(Y, range(j, A.size1()), range(0, Y.size2())),
-=======
 
 
           //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
@@ -580,7 +401,6 @@ namespace viennacl
             MatrixType temp = prod(trans(W_part), A_part);
 
             A_part += prod(project(Y, range(j, A.size1()), range(0, effective_block_size)),
->>>>>>> upstream/1.5.1
                           temp);
           }
         }
@@ -594,61 +414,29 @@ namespace viennacl
 
 
       //MatrixType is ViennaCL-matrix
-<<<<<<< HEAD
-      /** @brief Implementation of a hybrid QR factorization using uBLAS on the CPU and ViennaCL for GPUs (or multi-core CPU) 
-      * 
-      * Prefer the use of the convenience interface inplace_qr()
-      * 
-=======
       /** @brief Implementation of a hybrid QR factorization using uBLAS on the CPU and ViennaCL for GPUs (or multi-core CPU)
       *
       * Prefer the use of the convenience interface inplace_qr()
       *
->>>>>>> upstream/1.5.1
       * @param A            A dense ViennaCL matrix to be factored
       * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
       */
       template<typename MatrixType>
-<<<<<<< HEAD
-      std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type > 
-      inplace_qr_hybrid(MatrixType & A, std::size_t block_size = 16)
-=======
       std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type >
       inplace_qr_hybrid(MatrixType & A, vcl_size_t block_size = 16)
->>>>>>> upstream/1.5.1
       {
         typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
 
         typedef viennacl::matrix_range<MatrixType>                    VCLMatrixRange;
         typedef boost::numeric::ublas::matrix<ScalarType>             UblasMatrixType;
         typedef boost::numeric::ublas::matrix_range<UblasMatrixType>  UblasMatrixRange;
-<<<<<<< HEAD
-        
-        //using boost::numeric::ublas::range;
-        //using boost::numeric::ublas::project;
-        
-=======
-
->>>>>>> upstream/1.5.1
+
         std::vector<ScalarType> betas(A.size2());
         UblasMatrixType v(A.size1(), 1);
         UblasMatrixType matrix_1x1(1,1);
 
         UblasMatrixType ublasW(A.size1(), block_size); ublasW.clear(); ublasW.resize(A.size1(), block_size);
         UblasMatrixType ublasY(A.size1(), block_size); ublasY.clear(); ublasY.resize(A.size1(), block_size);
-<<<<<<< HEAD
-        
-        UblasMatrixType ublasA(A.size1(), A.size1());
-        
-        MatrixType vclW(ublasW.size1(), ublasW.size2());
-        MatrixType vclY(ublasY.size1(), ublasY.size2());
-        
-          
-        //run over A in a block-wise manner:
-        for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
-        {
-          std::size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
-=======
 
         UblasMatrixType ublasA(A.size1(), A.size1());
 
@@ -660,7 +448,6 @@ namespace viennacl
         for (vcl_size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
         {
           vcl_size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
->>>>>>> upstream/1.5.1
           UblasMatrixRange ublasA_part = boost::numeric::ublas::project(ublasA,
                                                                         boost::numeric::ublas::range(0, A.size1()),
                                                                         boost::numeric::ublas::range(j, j + effective_block_size));
@@ -669,15 +456,6 @@ namespace viennacl
                                           viennacl::range(j, j+effective_block_size)),
                          ublasA_part
                         );
-<<<<<<< HEAD
-          
-          //determine Householder vectors:
-          for (std::size_t k = 0; k < effective_block_size; ++k)
-          {
-            betas[j+k] = detail::setup_householder_vector_ublas(ublasA, v, matrix_1x1, j+k);
-            
-            for (std::size_t l = k; l < effective_block_size; ++l)
-=======
 
           //determine Householder vectors:
           for (vcl_size_t k = 0; k < effective_block_size; ++k)
@@ -685,7 +463,6 @@ namespace viennacl
             betas[j+k] = detail::setup_householder_vector_ublas(ublasA, v, matrix_1x1, j+k);
 
             for (vcl_size_t l = k; l < effective_block_size; ++l)
->>>>>>> upstream/1.5.1
               detail::householder_reflect_ublas(ublasA, v, matrix_1x1, betas[j+k], j+k, j+l);
 
             detail::write_householder_to_A_ublas(ublasA, v, j+k);
@@ -695,37 +472,6 @@ namespace viennacl
           // Setup Y:
           //
           ublasY.clear();  ublasY.resize(A.size1(), block_size);
-<<<<<<< HEAD
-          for (std::size_t k = 0; k < effective_block_size; ++k)
-          {
-            //write Householder to Y:
-            ublasY(j+k,k) = 1.0;
-            boost::numeric::ublas::project(ublasY, 
-                                           boost::numeric::ublas::range(j+k+1, A.size1()), 
-                                           boost::numeric::ublas::range(k, k+1)) 
-              = boost::numeric::ublas::project(ublasA, 
-                                               boost::numeric::ublas::range(j+k+1, A.size1()),
-                                               boost::numeric::ublas::range(j+k, j+k+1));
-          }
-          
-          //
-          // Setup W:
-          //
-          
-          //first vector:
-          ublasW.clear();  ublasW.resize(A.size1(), block_size);
-          ublasW(j, 0) = -betas[j];
-          boost::numeric::ublas::project(ublasW, 
-                                        boost::numeric::ublas::range(j+1, A.size1()), 
-                                        boost::numeric::ublas::range(0, 1)) 
-            = -betas[j] * boost::numeric::ublas::project(ublasA, 
-                                                          boost::numeric::ublas::range(j+1, A.size1()), 
-                                                          boost::numeric::ublas::range(j, j+1));
-          
-          
-          //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
-          for (std::size_t k = 1; k < effective_block_size; ++k)
-=======
           for (vcl_size_t k = 0; k < effective_block_size; ++k)
           {
             //write Householder to Y:
@@ -755,7 +501,6 @@ namespace viennacl
 
           //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
           for (vcl_size_t k = 1; k < effective_block_size; ++k)
->>>>>>> upstream/1.5.1
           {
             UblasMatrixRange Y_old = boost::numeric::ublas::project(ublasY,
                                                                     boost::numeric::ublas::range(j, A.size1()),
@@ -763,20 +508,6 @@ namespace viennacl
             UblasMatrixRange v_k   = boost::numeric::ublas::project(ublasY,
                                                                     boost::numeric::ublas::range(j, A.size1()),
                                                                     boost::numeric::ublas::range(k, k+1));
-<<<<<<< HEAD
-            UblasMatrixRange W_old = boost::numeric::ublas::project(ublasW, 
-                                                                    boost::numeric::ublas::range(j, A.size1()), 
-                                                                    boost::numeric::ublas::range(0, k));
-            UblasMatrixRange z     = boost::numeric::ublas::project(ublasW, 
-                                                                    boost::numeric::ublas::range(j, A.size1()), 
-                                                                    boost::numeric::ublas::range(k, k+1));
-            
-            UblasMatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
-            z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
-          }
-          
-          
-=======
             UblasMatrixRange W_old = boost::numeric::ublas::project(ublasW,
                                                                     boost::numeric::ublas::range(j, A.size1()),
                                                                     boost::numeric::ublas::range(0, k));
@@ -789,45 +520,19 @@ namespace viennacl
           }
 
 
->>>>>>> upstream/1.5.1
 
           //
           //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
           //
-<<<<<<< HEAD
-          
-          VCLMatrixRange A_part = viennacl::project(A,
-                                                    viennacl::range(0, A.size1()),
-                                                    viennacl::range(j, j+effective_block_size));
-          
-=======
 
           VCLMatrixRange A_part = viennacl::project(A,
                                                     viennacl::range(0, A.size1()),
                                                     viennacl::range(j, j+effective_block_size));
 
->>>>>>> upstream/1.5.1
           viennacl::copy(boost::numeric::ublas::project(ublasA,
                                                         boost::numeric::ublas::range(0, A.size1()),
                                                         boost::numeric::ublas::range(j, j+effective_block_size)),
                         A_part);
-<<<<<<< HEAD
-          
-          viennacl::copy(ublasW, vclW);
-          viennacl::copy(ublasY, vclY);
-          
-          if (A.size2() - j - effective_block_size > 0)
-          {
-            
-            VCLMatrixRange A_part(A, viennacl::range(j, A.size1()), viennacl::range(j+effective_block_size, A.size2()));
-            VCLMatrixRange W_part(vclW, viennacl::range(j, A.size1()), viennacl::range(0, effective_block_size));
-            MatrixType temp = viennacl::linalg::prod(trans(W_part), A_part);
-            
-            A_part += viennacl::linalg::prod(viennacl::project(vclY, 
-                                             viennacl::range(j, A.size1()), 
-                                             viennacl::range(0, vclY.size2())),
-                          temp);
-=======
 
           viennacl::copy(ublasW, vclW);
           viennacl::copy(ublasY, vclY);
@@ -841,7 +546,6 @@ namespace viennacl
 
             A_part += viennacl::linalg::prod(viennacl::project(vclY, viennacl::range(j, A.size1()), viennacl::range(0, effective_block_size)),
                                              temp);
->>>>>>> upstream/1.5.1
           }
         }
 
@@ -855,7 +559,6 @@ namespace viennacl
 
 
 
-
     //takes an inplace QR matrix A and generates Q and R explicitly
     template <typename MatrixType, typename VectorType>
     void recoverQ(MatrixType const & A, VectorType const & betas, MatrixType & Q, MatrixType & R)
@@ -894,10 +597,6 @@ namespace viennacl
       }
     }
 
-<<<<<<< HEAD
-    /** @brief Overload of inplace-QR factorization of a ViennaCL matrix A 
-     * 
-=======
 
     /** @brief Computes Q^T b, where Q is an implicit orthogonal matrix defined via its Householder reflectors stored in A.
      *
@@ -941,19 +640,12 @@ namespace viennacl
 
     /** @brief Overload of inplace-QR factorization of a ViennaCL matrix A
      *
->>>>>>> upstream/1.5.1
      * @param A            A dense ViennaCL matrix to be factored
      * @param block_size   The block size to be used.
      */
     template<typename T, typename F, unsigned int ALIGNMENT>
     std::vector<T> inplace_qr(viennacl::matrix<T, F, ALIGNMENT> & A, vcl_size_t block_size = 16)
     {
-<<<<<<< HEAD
-      if (A.size2() % block_size != 0)
-        std::cerr << "ViennaCL: Warning in inplace_qr(): Number of columns is not a multiple of the block size" << std::endl;
-      
-=======
->>>>>>> upstream/1.5.1
       return detail::inplace_qr_hybrid(A, block_size);
     }
 
@@ -965,12 +657,6 @@ namespace viennacl
     template<typename MatrixType>
     std::vector<typename MatrixType::value_type> inplace_qr(MatrixType & A, vcl_size_t block_size = 16)
     {
-<<<<<<< HEAD
-      if (A.size2() % block_size != 0)
-        std::cerr << "ViennaCL: Warning in inplace_qr(): Number of columns is not a multiple of the block size" << std::endl;
-      
-=======
->>>>>>> upstream/1.5.1
       return detail::inplace_qr_ublas(A, block_size);
     }
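Usage of the QR routines kept above, as a sketch: inplace_qr() overwrites A with the Householder vectors and R and returns the betas, and recoverQ() rebuilds Q and R explicitly. Sizes are illustrative; per the doc comments, A.size2() should be a multiple of block_size. The Boost.uBLAS input matrix exercises the generic overload shown above:

    #include <vector>
    #include <boost/numeric/ublas/matrix.hpp>
    #include "viennacl/linalg/qr.hpp"

    void qr_sketch()
    {
      boost::numeric::ublas::matrix<float> A(128, 64), Q(128, 128), R(128, 64);
      // ... fill A ...
      std::vector<float> betas = viennacl::linalg::inplace_qr(A, 16);  // A.size2() % 16 == 0
      viennacl::linalg::recoverQ(A, betas, Q, R);                      // rebuild Q and R
    }
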
 
diff --git a/viennacl/linalg/row_scaling.hpp b/viennacl/linalg/row_scaling.hpp
index 6e93148..8795fb8 100644
--- a/viennacl/linalg/row_scaling.hpp
+++ b/viennacl/linalg/row_scaling.hpp
@@ -184,28 +184,8 @@ namespace viennacl
         template <unsigned int ALIGNMENT>
         void apply(viennacl::vector<ScalarType, ALIGNMENT> & vec) const
         {
-<<<<<<< HEAD
-          assert(viennacl::traits::size1(system_matrix) == viennacl::traits::size(vec));
-          
-          //run kernel:
-          viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<ScalarType, ALIGNMENT>::program_name(),
-                                                                "diag_precond");
-
-          viennacl::ocl::enqueue(
-             k(viennacl::traits::handle(diag_M_inv),
-                cl_uint(viennacl::traits::start(diag_M_inv)),
-                cl_uint(viennacl::traits::stride(diag_M_inv)),
-                cl_uint(viennacl::traits::size(diag_M_inv)),
-               viennacl::traits::handle(vec),
-                cl_uint(viennacl::traits::start(vec)),
-                cl_uint(viennacl::traits::stride(vec)),
-                cl_uint(viennacl::traits::size(vec)) )
-                                );        
-          
-=======
           assert(viennacl::traits::size(diag_M) == viennacl::traits::size(vec) && bool("Size mismatch"));
           vec = element_div(vec, diag_M);
->>>>>>> upstream/1.5.1
         }
 
       private:
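With the upstream change, row_scaling::apply() reduces to an element-wise division by the stored scaling vector. In practice the class is handed to the iterative solvers as a preconditioner; a hedged sketch, assuming the row_scaling_tag(p) constructor documented in the ViennaCL manual (not visible in this hunk):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/cg.hpp"
    #include "viennacl/linalg/row_scaling.hpp"

    void row_scaling_sketch()
    {
      viennacl::compressed_matrix<float> A(1000, 1000);   // assembled by the caller
      viennacl::vector<float> rhs(1000), x(1000);

      viennacl::linalg::row_scaling< viennacl::compressed_matrix<float> >
          precond(A, viennacl::linalg::row_scaling_tag(2));   // scale rows by their l^2 norms

      x = viennacl::linalg::solve(A, rhs, viennacl::linalg::cg_tag(), precond);
    }
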
diff --git a/viennacl/linalg/svd.hpp b/viennacl/linalg/svd.hpp
index 405b1ef..3f07411 100644
--- a/viennacl/linalg/svd.hpp
+++ b/viennacl/linalg/svd.hpp
@@ -2,40 +2,25 @@
 #define VIENNACL_LINALG_SVD_HPP
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file viennacl/linalg/svd.hpp
-<<<<<<< HEAD
-    @brief Provides singular value decomposition using a block-based approach.  Experimental in 1.3.x.
-    
-=======
     @brief Provides singular value decomposition using a block-based approach.  Experimental.
 
->>>>>>> upstream/1.5.1
     Contributed by Volodymyr Kysenko.
 */
 
@@ -48,100 +33,6 @@
 #include <cmath>
 
 #include "viennacl/matrix.hpp"
-<<<<<<< HEAD
-#include "viennacl/linalg/kernels/svd_kernels.h"
-
-namespace viennacl 
-{
-  namespace linalg 
-  {
-  
-    //const std::string SVD_KERNELS_FOLDER = "../../non-release/svd-kernels/";
-    //const std::string SVD_BIDIAG_PROGRAM = "bidiag.cl";
-
-    const std::string SVD_BIDIAG_PACK_KERNEL = "bidiag_pack";
-    const std::string SVD_HOUSEHOLDER_COL_KERNEL = "house_col";
-    const std::string SVD_HOUSEHOLDER_ROW_KERNEL = "house_row";
-    const std::string SVD_COPY_COL_KERNEL = "copy_col";
-    const std::string SVD_COPY_ROW_KERNEL = "copy_row";
-    const std::string SVD_MATRIX_TRANSPOSE_KERNEL = "transpose_inplace";
-    const std::string SVD_INVERSE_SIGNS_KERNEL = "inverse_signs";
-    const std::string SVD_GIVENS_PREV_KERNEL = "givens_prev";
-    
-    namespace detail 
-    {
-      static const float EPS = 0.00001f;
-      static const std::size_t ITER_MAX = 50;
-
-      inline float pythag(float a, float b) 
-      {
-        float absa = std::abs(a);
-        float absb = std::abs(b);
-
-        if(absa > absb) {
-          return absa * sqrt(1.0f + pow(absb / absa, 2));
-        } else {
-          return absb * sqrt(1.0f + pow(absa / absb, 2));
-        }
-      }
-
-      inline float sign(float val) 
-      {
-          return val >= 0.0f ? 1.0f : -1.0f;
-      }
-
-      inline float norm_lcl(std::vector<float>& x, unsigned int size) 
-      {
-        float x_norm = 0.0;
-        for(std::size_t i = 0; i < size; i++) x_norm += std::pow(x[i], 2);
-        x_norm = std::sqrt(x_norm);
-        return x_norm;
-      }
-
-      template <typename T>
-      void normalize(std::vector<T>& x, unsigned int size) 
-      {
-        float x_norm = norm_lcl(x, size);
-        for(std::size_t i = 0; i < size; i++) {
-            x[i] /= x_norm;
-        }
-      }
-
-      template <typename T>
-      void householder_vector(std::vector<T> & v, unsigned int start)
-      {
-        float x_norm = norm_lcl(v, v.size());
-        float alpha = -sign(v[start]) * x_norm;
-        v[start] += alpha;
-        normalize(v, v.size());
-      }
-
-      template <typename MatrixType>
-      void transpose(MatrixType& A)
-      {
-
-        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_MATRIX_TRANSPOSE_KERNEL);
-
-        viennacl::ocl::enqueue(kernel(
-                                      A,
-                                      static_cast<cl_uint>(A.internal_size1()),
-                                      static_cast<cl_uint>(A.internal_size2())
-                              ));
-      }
-
-      template<typename MatrixType, typename VectorType>
-      void givens_prev(MatrixType& matrix,
-                        VectorType& tmp1,
-                        VectorType& tmp2,
-                        int n,
-                        int l,
-                        int k
-                      )
-      {
-        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_GIVENS_PREV_KERNEL);
-
-        kernel.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size1(matrix), 256));
-=======
 #include "viennacl/linalg/opencl/kernels/svd.hpp"
 #include "viennacl/linalg/qr-method-common.hpp"
 
@@ -169,7 +60,6 @@ namespace viennacl
         viennacl::ocl::kernel & kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<CPU_ScalarType>::program_name(), SVD_GIVENS_PREV_KERNEL);
 
         kernel.global_work_size(0, viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size1(matrix), 256));
->>>>>>> upstream/1.5.1
         kernel.local_work_size(0, 256);
 
         viennacl::ocl::enqueue(kernel(
@@ -187,12 +77,6 @@ namespace viennacl
       template<typename MatrixType, typename VectorType>
       void change_signs(MatrixType& matrix, VectorType& signs, int n)
       {
-<<<<<<< HEAD
-        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_INVERSE_SIGNS_KERNEL);
-
-        kernel.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size1(matrix), 16));
-        kernel.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size2(matrix), 16));
-=======
         typedef typename MatrixType::value_type                                   ScalarType;
         typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
 
@@ -201,7 +85,6 @@ namespace viennacl
 
         kernel.global_work_size(0, viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size1(matrix), 16));
         kernel.global_work_size(1, viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size2(matrix), 16));
->>>>>>> upstream/1.5.1
 
         kernel.local_work_size(0, 16);
         kernel.local_work_size(1, 16);
@@ -214,16 +97,6 @@ namespace viennacl
                               ));
       }
 
-<<<<<<< HEAD
-      template<typename MatrixType>
-      void svd_qr_shift(MatrixType& vcl_u,
-                        MatrixType& vcl_v,
-                        boost::numeric::ublas::vector<float> &q, 
-                        boost::numeric::ublas::vector<float> &e)
-      {
-        int n = q.size();
-        int m = vcl_u.size1();
-=======
       template<typename MatrixType, typename CPU_VectorType>
       void svd_qr_shift(MatrixType & vcl_u,
                         MatrixType & vcl_v,
@@ -235,30 +108,10 @@ namespace viennacl
 
         int n = static_cast<int>(q.size());
         int m = static_cast<int>(vcl_u.size1());
->>>>>>> upstream/1.5.1
 
         detail::transpose(vcl_u);
         detail::transpose(vcl_v);
 
-<<<<<<< HEAD
-        std::vector<float> signs_v(n, 1.0f);
-        std::vector<float> cs1(n), ss1(n), cs2(n), ss2(n);
-        
-        viennacl::vector<float> tmp1(n), tmp2(n);
-
-        bool goto_test_conv = false;
-
-        for (int k = n - 1; k >= 0; k--) {
-          // std::cout << "K = " << k << std::endl;
-
-          std::size_t iter = 0;
-          for (iter = 0; iter < detail::ITER_MAX; iter++) {
-            // test for split
-            int l;
-            for (l = k; l >= 0; l--) {
-              goto_test_conv = false;
-              if (fabs(e[l]) <= detail::EPS) {
-=======
         std::vector<CPU_ScalarType> signs_v(n, 1);
         std::vector<CPU_ScalarType> cs1(n), ss1(n), cs2(n), ss2(n);
 
@@ -280,43 +133,18 @@ namespace viennacl
               goto_test_conv = false;
               if (std::fabs(e[l]) <= detail::EPS)
               {
->>>>>>> upstream/1.5.1
                 // set it
                 goto_test_conv = true;
                 break;
               }
 
-<<<<<<< HEAD
-              if (fabs(q[l - 1]) <= detail::EPS) {
-=======
               if (std::fabs(q[l - 1]) <= detail::EPS)
               {
->>>>>>> upstream/1.5.1
                 // goto
                 break;
               }
             }
 
-<<<<<<< HEAD
-            if (!goto_test_conv) {
-              float c = 0.0;
-              float s = 1.0;
-
-              //int l1 = l - 1;
-              int l2 = k;
-
-              for (int i = l; i <= k; i++) {
-                float f = s * e[i];
-                e[i] = c * e[i];
-
-                if (fabs(f) <= detail::EPS) {
-                  l2 = i - 1;
-                  break;
-                }
-
-                float g = q[i];
-                float h = detail::pythag(f, g);
-=======
             if (!goto_test_conv)
             {
               CPU_ScalarType c = 0.0;
@@ -338,7 +166,6 @@ namespace viennacl
 
                 CPU_ScalarType g = q[i];
                 CPU_ScalarType h = detail::pythag(f, g);
->>>>>>> upstream/1.5.1
                 q[i] = h;
                 c = g / h;
                 s = -f / h;
@@ -349,36 +176,18 @@ namespace viennacl
 
               // std::cout << "Hitted!" << l1 << " " << l2 << "\n";
 
-<<<<<<< HEAD
-              // for(int i = l; i <= l2; i++) 
-              // {
-              //   for (int j = 0; j < m; j++) 
-              //   {
-              //     float y = u(j, l1);
-              //     float z = u(j, i);
-=======
               // for(int i = l; i <= l2; i++)
               // {
               //   for (int j = 0; j < m; j++)
               //   {
               //     CPU_ScalarType y = u(j, l1);
               //     CPU_ScalarType z = u(j, i);
->>>>>>> upstream/1.5.1
               //     u(j, l1) = y * cs1[i] + z * ss1[i];
               //     u(j, i) = -y * ss1[i] + z * cs1[i];
               //   }
               // }
             }
 
-<<<<<<< HEAD
-            float z = q[k];
-
-            if (l == k) {
-              if (z < 0.0f) {
-                q[k] = -z;
-
-                signs_v[k] *= -1.0f;
-=======
             CPU_ScalarType z = q[k];
 
             if (l == k)
@@ -388,25 +197,11 @@ namespace viennacl
                 q[k] = -z;
 
                 signs_v[k] *= -1;
->>>>>>> upstream/1.5.1
               }
 
               break;
             }
 
-<<<<<<< HEAD
-            if (iter >= detail::ITER_MAX - 1) {
-              break;
-            }
-
-            float x = q[l];
-            float y = q[k - 1];
-            float g = e[k - 1];
-            float h = e[k];
-            float f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0f * h * y);
-            
-            g = detail::pythag(f, 1.0);
-=======
             if (iter >= detail::ITER_MAX - 1)
               break;
 
@@ -417,7 +212,6 @@ namespace viennacl
             CPU_ScalarType f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2 * h * y);
 
             g = detail::pythag<CPU_ScalarType>(f, 1);
->>>>>>> upstream/1.5.1
 
             if (f < 0) {
               f = ((x - z) * (x + z) + h * (y / (f - g) - h)) / x;
@@ -425,27 +219,16 @@ namespace viennacl
               f = ((x - z) * (x + z) + h * (y / (f + g) - h)) / x;
             }
 
-<<<<<<< HEAD
-            float c = 1.0;
-            float s = 1.0;
-
-            for (std::size_t i = l + 1; i <= static_cast<std::size_t>(k); i++) 
-=======
             CPU_ScalarType c = 1;
             CPU_ScalarType s = 1;
 
             for (vcl_size_t i = l + 1; i <= static_cast<vcl_size_t>(k); i++)
->>>>>>> upstream/1.5.1
             {
               g = e[i];
               y = q[i];
               h = s * g;
               g = c * g;
-<<<<<<< HEAD
-              float z = detail::pythag(f, h);
-=======
               CPU_ScalarType z = detail::pythag(f, h);
->>>>>>> upstream/1.5.1
               e[i - 1] = z;
               c = f / z;
               s = h / z;
@@ -453,11 +236,7 @@ namespace viennacl
               g = -x * s + g * c;
               h = y * s;
               y = y * c;
-<<<<<<< HEAD
-              
-=======
 
->>>>>>> upstream/1.5.1
               cs1[i] = c;
               ss1[i] = s;
 
@@ -471,11 +250,7 @@ namespace viennacl
               cs2[i] = c;
               ss2[i] = s;
             }
-<<<<<<< HEAD
-            
-=======
 
->>>>>>> upstream/1.5.1
             {
               viennacl::copy(cs1, tmp1);
               viennacl::copy(ss1, tmp2);
@@ -489,11 +264,7 @@ namespace viennacl
 
               givens_prev(vcl_u, tmp1, tmp2, m, l, k);
             }
-<<<<<<< HEAD
-            
-=======
 
->>>>>>> upstream/1.5.1
             e[l] = 0.0;
             e[k] = f;
             q[k] = x;
@@ -501,11 +272,7 @@ namespace viennacl
 
         }
 
-<<<<<<< HEAD
-        
-=======
 
->>>>>>> upstream/1.5.1
         viennacl::copy(signs_v, tmp1);
         change_signs(vcl_v, tmp1, n);
 
@@ -514,60 +281,6 @@ namespace viennacl
         detail::transpose(vcl_v);
       }
 
-<<<<<<< HEAD
-      template <typename SCALARTYPE, unsigned int ALIGNMENT>
-      void eye(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A)
-      {
-      
-        std::vector<SCALARTYPE> foo(A.size1() * A.size1(), 0);
-        
-        for(std::size_t i = 0; i < A.size1(); i++)
-        {
-          foo[i*A.size1() + i] = 1;
-        }
-
-        viennacl::fast_copy(&foo[0], &foo[0] + foo.size(), A);
-      }
-      
-      template <typename SCALARTYPE, unsigned int ALIGNMENT>
-      void copy_vec(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
-                    viennacl::vector<SCALARTYPE, ALIGNMENT>& V,
-                    std::size_t row_start, 
-                    std::size_t col_start, 
-                    bool copy_col
-      )
-      {
-
-        std::string kernel_name = copy_col ? SVD_COPY_COL_KERNEL : SVD_COPY_ROW_KERNEL;
-        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(),
-                                                                  kernel_name);
-
-        viennacl::ocl::enqueue(kernel(
-                                      A, 
-                                      V, 
-                                      static_cast<cl_uint>(row_start), 
-                                      static_cast<cl_uint>(col_start),
-                                      copy_col ? static_cast<cl_uint>(A.size1())
-                                               : static_cast<cl_uint>(A.size2()),
-                                      static_cast<cl_uint>(A.internal_size2())
-                              ));
-
-      }
-
-      template <typename SCALARTYPE, unsigned int ALIGNMENT>
-      bool householder_c(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
-                          viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Q,
-                          viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
-                          std::size_t start) 
-      {
-
-        std::size_t row_start = start, col_start = start;
-
-        if(row_start + 1 >= A.size1()) 
-          return false;
-
-        std::vector<float> tmp(A.size1(), 0);
-=======
 
       /*template <typename SCALARTYPE, unsigned int ALIGNMENT>
       bool householder_c(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & A,
@@ -583,22 +296,15 @@ namespace viennacl
           return false;
 
         std::vector<SCALARTYPE> tmp(A.size1(), 0);
->>>>>>> upstream/1.5.1
 
         copy_vec(A, D, row_start, col_start, true);
         fast_copy(D.begin(), D.begin() + (A.size1() - row_start), tmp.begin() + row_start);
 
         detail::householder_vector(tmp, row_start);
-<<<<<<< HEAD
-        fast_copy(tmp, D);
-
-        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_HOUSEHOLDER_COL_KERNEL);
-=======
 
         fast_copy(tmp, D);
 
         viennacl::ocl::kernel & kernel = viennacl::ocl::get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_COL_KERNEL);
->>>>>>> upstream/1.5.1
 
         //kernel.global_work_size(0, A.size1() << 1);
 
@@ -612,14 +318,6 @@ namespace viennacl
                                       static_cast<cl_uint>(A.size2()),
                                       static_cast<cl_uint>(A.internal_size2()),
                                       static_cast<cl_uint>(Q.internal_size2()),
-<<<<<<< HEAD
-                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
-                              ));
-
-        return true;
-      }
-
-=======
                                       viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
                               ));
 
@@ -671,24 +369,10 @@ namespace viennacl
       }
 
       /*
->>>>>>> upstream/1.5.1
       template <typename SCALARTYPE, unsigned int ALIGNMENT>
       bool householder_r(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
                           viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Q,
                           viennacl::vector<SCALARTYPE, ALIGNMENT>& S,
-<<<<<<< HEAD
-                          std::size_t start)
-      {
-      
-        std::size_t row_start = start, col_start = start + 1;
-        if(col_start + 1 >= A.size2()) 
-          return false;
-
-        std::vector<float> tmp(A.size2(), 0);
-
-        copy_vec(A, S, row_start, col_start, false);
-        fast_copy(S.begin(), S.begin() + (A.size2() - col_start), tmp.begin() + col_start);
-=======
                           vcl_size_t start)
       {
 
@@ -704,16 +388,11 @@ namespace viennacl
         fast_copy(S.begin(),
                   S.begin() + (A.size2() - col_start),
                   tmp.begin() + col_start);
->>>>>>> upstream/1.5.1
 
         detail::householder_vector(tmp, col_start);
         fast_copy(tmp, S);
 
-<<<<<<< HEAD
-        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_HOUSEHOLDER_ROW_KERNEL);
-=======
         viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_ROW_KERNEL);
->>>>>>> upstream/1.5.1
 
         viennacl::ocl::enqueue(kernel(
                                       A,
@@ -725,41 +404,6 @@ namespace viennacl
                                       static_cast<cl_uint>(A.size2()),
                                       static_cast<cl_uint>(A.internal_size2()),
                                       static_cast<cl_uint>(Q.internal_size2()),
-<<<<<<< HEAD
-                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
-                                ));
-        return true;
-      }
-
-      template <typename SCALARTYPE, unsigned int ALIGNMENT>
-      void bidiag_pack(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
-                        viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
-                        viennacl::vector<SCALARTYPE, ALIGNMENT>& S
-                      )
-      {
-        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::svd<float, 1>::program_name(), SVD_BIDIAG_PACK_KERNEL);
-
-        viennacl::ocl::enqueue(kernel(
-                                      A, 
-                                      D, 
-                                      S,
-                                      static_cast<cl_uint>(A.size1()), 
-                                      static_cast<cl_uint>(A.size2()),
-                                      static_cast<cl_uint>(A.internal_size2())
-                                    ));
-      }
-
-      template <typename SCALARTYPE, unsigned int ALIGNMENT>
-      void bidiag(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Ai,
-                  viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& QL,
-                  viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& QR)
-      {
-        std::size_t row_num = Ai.size1();
-        std::size_t col_num = Ai.size2();
-
-        std::size_t to = std::min(row_num, col_num);
-        std::size_t big_to = std::max(row_num, col_num);
-=======
                                       viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
                                 ));
         return true;
@@ -821,20 +465,10 @@ namespace viennacl
 
         vcl_size_t to = std::min(row_num, col_num);
         vcl_size_t big_to = std::max(row_num, col_num);
->>>>>>> upstream/1.5.1
 
         //for storing householder vector
         viennacl::vector<SCALARTYPE, ALIGNMENT> hh_vector(big_to);
 
-<<<<<<< HEAD
-        eye(QL);
-        eye(QR);
-
-        for(std::size_t i = 0; i < to; i++) 
-        {
-          householder_c(Ai, QL, hh_vector, i);
-          householder_r(Ai, QR, hh_vector, i);
-=======
         QL = viennacl::identity_matrix<SCALARTYPE>(QL.size1(), ctx);
         QR = viennacl::identity_matrix<SCALARTYPE>(QR.size1(), ctx);
 
@@ -842,65 +476,18 @@ namespace viennacl
         {
           householder_c(Ai, QL, hh_vector, i, i);
           householder_r(Ai, QR, hh_vector, i, i+1);
->>>>>>> upstream/1.5.1
         }
       }
 
     } // namespace detail
 
 
-<<<<<<< HEAD
-    /** @brief Computes the singular value decomposition of a matrix A. Experimental - works for single precision (float) only. Experimental in 1.3.x
-     * 
-=======
     /** @brief Computes the singular value decomposition of a matrix A. Experimental in 1.3.x
      *
->>>>>>> upstream/1.5.1
      * @param A     The input matrix. Will be overwritten with a diagonal matrix containing the singular values on return
      * @param QL    The left orthogonal matrix
      * @param QR    The right orthogonal matrix
      */
-<<<<<<< HEAD
-    template <unsigned int ALIGNMENT>
-    void svd(viennacl::matrix<float, row_major, ALIGNMENT> & A,
-              viennacl::matrix<float, row_major, ALIGNMENT> & QL,
-              viennacl::matrix<float, row_major, ALIGNMENT> & QR) 
-    {
-      typedef float SCALARTYPE;
-      
-      viennacl::linalg::kernels::svd<SCALARTYPE, 1>::init();
-
-      std::size_t row_num = A.size1();
-      std::size_t col_num = A.size2();
-
-      std::size_t to = std::min(row_num, col_num);
-
-
-      viennacl::vector<SCALARTYPE, ALIGNMENT> d(to);
-      viennacl::vector<SCALARTYPE, ALIGNMENT> s(to + 1);
-      
-      // first stage
-      detail::bidiag(A, QL, QR);
-      detail::bidiag_pack(A, d, s);
-
-      // second stage
-      boost::numeric::ublas::vector<SCALARTYPE> dh(to, 0.0f);
-      boost::numeric::ublas::vector<SCALARTYPE> sh(to + 1, 0.0f);
-
-      boost::numeric::ublas::matrix<float> h_U(row_num, row_num);
-      boost::numeric::ublas::matrix<float> h_V(col_num, col_num);
-
-      fast_copy(d, dh);
-      fast_copy(s, sh);
-
-      detail::svd_qr_shift( QL, QR, dh, sh);
-
-      boost::numeric::ublas::matrix<float> h_Sigma(row_num, col_num);
-      h_Sigma.clear();
-
-      for (std::size_t i = 0; i < to; i++)
-        h_Sigma(i, i) = dh(i);
-=======
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     void svd(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & A,
               viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & QL,
@@ -937,7 +524,6 @@ namespace viennacl
 
       for (vcl_size_t i = 0; i < to; i++)
         h_Sigma(i, i) = dh[i];
->>>>>>> upstream/1.5.1
 
       copy(h_Sigma, A);
     }
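
For orientation while reading the svd.hpp hunks above: the public entry point retained from upstream/1.5.1 is viennacl::linalg::svd(A, QL, QR). It first bidiagonalizes A (detail::bidiag), then runs the QR-shift sweep on the host (detail::svd_qr_shift), and finally writes the singular values back onto the diagonal of A. A minimal usage sketch follows; it assumes an OpenCL-enabled build, and the sizes and include paths are illustrative rather than taken from this diff.

    #include <cstddef>
    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/svd.hpp"

    void svd_example()
    {
      std::size_t m = 64, n = 32;                              // illustrative sizes
      viennacl::matrix<float, viennacl::row_major> A(m, n);    // input; overwritten with diag(sigma)
      viennacl::matrix<float, viennacl::row_major> QL(m, m);   // left orthogonal factor
      viennacl::matrix<float, viennacl::row_major> QR(n, n);   // right orthogonal factor

      // ... fill A, e.g. with viennacl::copy() from a host matrix ...

      viennacl::linalg::svd(A, QL, QR);  // A now carries the singular values on its diagonal
    }
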
diff --git a/viennacl/linalg/vector_operations.hpp b/viennacl/linalg/vector_operations.hpp
index 30832ff..9b6eb51 100644
--- a/viennacl/linalg/vector_operations.hpp
+++ b/viennacl/linalg/vector_operations.hpp
@@ -31,8 +31,6 @@
 #include "viennacl/traits/start.hpp"
 #include "viennacl/traits/handle.hpp"
 #include "viennacl/traits/stride.hpp"
-<<<<<<< HEAD
-=======
 #include "viennacl/linalg/host_based/vector_operations.hpp"
 
 #ifdef VIENNACL_WITH_OPENCL
@@ -42,94 +40,11 @@
 #ifdef VIENNACL_WITH_CUDA
   #include "viennacl/linalg/cuda/vector_operations.hpp"
 #endif
->>>>>>> upstream/1.5.1
 
 namespace viennacl
 {
   namespace linalg
   {
-<<<<<<< HEAD
-    /** @brief Assign a vector (-range/-slice) to another vector (-range/slice).
-    *
-    * Computes vec1 += vec2.
-    * 
-    * @param vec1  The result. 
-    * @param vec2  The addend
-    */
-    template <typename V1, typename V2>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                >::type
-    assign(V1 & vec1,
-           const V2 & vec2)
-    {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_add()!");
-      
-      
-      //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "assign");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)))
-                            );
-    }
-    
-    /** @brief Addition of two vectors.
-    *
-    * @param vec1  The first addend. 
-    * @param vec2  The second addend.
-    * @param result The result vector.
-    */
-    template <typename V1, typename V2, typename V3>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                  && viennacl::is_vector<V3>::value
-                                >::type
-    add(const V1 & vec1, 
-        const V2 & vec2, 
-        V3 & result)
-    {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in add()!");
-
-      //unsigned int size = std::min(viennacl::traits::internal_size(vec1),
-      //                             viennacl::traits::internal_size(vec2));
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "add");
-      
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)),
-                               viennacl::traits::handle(result),
-                                cl_uint(viennacl::traits::start(result)),
-                                cl_uint(viennacl::traits::stride(result)),
-                                cl_uint(viennacl::traits::size(result)) )
-                            );
-=======
     template <typename T, typename ScalarType1>
     void av(vector_base<T> & vec1,
             vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
@@ -156,7 +71,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
 
@@ -165,30 +79,6 @@ namespace viennacl
               vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
               vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_add()!");
-      
-      
-      //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_add");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)))
-                            );
-=======
       assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in v1 = v2 @ alpha + v3 @ beta: size(v1) != size(v2)"));
       assert(viennacl::traits::size(vec2) == viennacl::traits::size(vec3) && bool("Incompatible vector sizes in v1 = v2 @ alpha + v3 @ beta: size(v2) != size(v3)"));
 
@@ -218,7 +108,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
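
The av() dispatcher and the three-vector variant above are internal; user code reaches them through the operator overloads on viennacl::vector, matching the v1 = v2 @ alpha + v3 @ beta form named in the asserts. A small sketch under that assumption (vector length and coefficients are illustrative):

    #include <cstddef>
    #include "viennacl/vector.hpp"

    void axpy_example()
    {
      std::size_t n = 1000;
      viennacl::vector<float> v1(n), v2(n), v3(n);
      float alpha = 2.0f, beta = -0.5f;

      // ... fill v2 and v3 ...

      v1 = alpha * v2;              // backed by av()
      v1 = alpha * v2 + beta * v3;  // backed by the v1 = v2 @ alpha + v3 @ beta dispatcher
    }
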
 
 
@@ -268,33 +157,6 @@ namespace viennacl
     template <typename T>
     void vector_assign(vector_base<T> & vec1, const T & alpha, bool up_to_internal_size = false)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in sub()!");
-      
-      //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sub");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)),
-                               viennacl::traits::handle(result),
-                                cl_uint(viennacl::traits::start(result)),
-                                cl_uint(viennacl::traits::stride(result)),
-                                cl_uint(viennacl::traits::size(result)) )
-                            );        
-=======
       switch (viennacl::traits::handle(vec1).get_active_handle_id())
       {
         case viennacl::MAIN_MEMORY:
@@ -315,7 +177,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
 
@@ -327,28 +188,6 @@ namespace viennacl
     template <typename T>
     void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_sub()!");
-      
-      //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_sub");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)))
-                            );        
-=======
       assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in vector_swap()"));
 
       switch (viennacl::traits::handle(vec1).get_active_handle_id())
@@ -371,7 +210,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
 
@@ -388,28 +226,6 @@ namespace viennacl
     void element_op(vector_base<T> & vec1,
                     vector_expression<const vector_base<T>, const vector_base<T>, OP> const & proxy)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in mult()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mult");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
-                                cl_uint(viennacl::traits::start(vec)),
-                                cl_uint(viennacl::traits::stride(vec)),
-                                cl_uint(viennacl::traits::size(vec)),
-                               alpha,
-                               viennacl::traits::handle(result),
-                                cl_uint(viennacl::traits::start(result)),
-                                cl_uint(viennacl::traits::stride(result)),
-                                cl_uint(viennacl::traits::size(result)))
-                            );        
-=======
       assert(viennacl::traits::size(vec1) == viennacl::traits::size(proxy) && bool("Incompatible vector sizes in element_op()"));
 
       switch (viennacl::traits::handle(vec1).get_active_handle_id())
@@ -432,7 +248,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
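
element_op() above implements the element-wise vector operations; in user code these are usually spelled viennacl::linalg::element_prod() and element_div(). A hedged sketch, assuming the 1.5.x spelling of those helpers:

    #include "viennacl/vector.hpp"

    void elementwise_example()
    {
      viennacl::vector<float> v1(1000), v2(1000), v3(1000);

      // ... fill v1 and v2 ...

      v3 = viennacl::linalg::element_prod(v1, v2);  // element-wise product, dispatched via element_op()
      v3 = viennacl::linalg::element_div(v1, v2);   // element-wise division
    }
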
 
     /** \cond */
@@ -537,28 +352,6 @@ namespace viennacl
                          vector_base<T> const & vec2,
                          scalar<T> & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in mult()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_mult");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
-                                cl_uint(viennacl::traits::start(vec)),
-                                cl_uint(viennacl::traits::stride(vec)),
-                                cl_uint(viennacl::traits::size(vec)),
-                               static_cast<value_type>(alpha),
-                               viennacl::traits::handle(result),
-                                cl_uint(viennacl::traits::start(result)),
-                                cl_uint(viennacl::traits::stride(result)),
-                                cl_uint(viennacl::traits::size(result)))
-                            );        
-=======
       assert( vec1.size() == vec2.size() && bool("Size mismatch") );
 
       switch (viennacl::traits::handle(vec1).get_active_handle_id())
@@ -581,7 +374,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
     // vector expression on lhs
@@ -590,24 +382,8 @@ namespace viennacl
                          vector_base<T> const & vec2,
                          scalar<T> & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mult");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
-                                cl_uint(viennacl::traits::start(vec)),
-                                cl_uint(viennacl::traits::stride(vec)),
-                                cl_uint(viennacl::traits::size(vec)),
-                               alpha)
-                            );
-=======
       viennacl::vector<T> temp = vec1;
       inner_prod_impl(temp, vec2, result);
->>>>>>> upstream/1.5.1
     }
 
 
@@ -617,24 +393,8 @@ namespace viennacl
                          viennacl::vector_expression<LHS, RHS, OP> const & vec2,
                          scalar<T> & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_inplace_mult");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), 
-                                cl_uint(viennacl::traits::start(vec)), 
-                                cl_uint(viennacl::traits::stride(vec)), 
-                                cl_uint(viennacl::traits::size(vec)), 
-                               static_cast<value_type>(alpha))
-                            );        
-=======
       viennacl::vector<T> temp = vec2;
       inner_prod_impl(vec1, temp, result);
->>>>>>> upstream/1.5.1
     }
 
 
@@ -645,32 +405,9 @@ namespace viennacl
                          viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
                          scalar<T> & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in divide()!");
-
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "divide");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), 
-                                cl_uint(viennacl::traits::start(vec)),
-                                cl_uint(viennacl::traits::stride(vec)),
-                                cl_uint(viennacl::traits::size(vec)),
-                               alpha,
-                               viennacl::traits::handle(result),
-                                cl_uint(viennacl::traits::start(result)),
-                                cl_uint(viennacl::traits::stride(result)),
-                                cl_uint(viennacl::traits::size(result)))
-                            );
-=======
       viennacl::vector<T> temp1 = vec1;
       viennacl::vector<T> temp2 = vec2;
       inner_prod_impl(temp1, temp2, result);
->>>>>>> upstream/1.5.1
     }
 
 
@@ -687,21 +424,6 @@ namespace viennacl
                         vector_base<T> const & vec2,
                         T & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_divide");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
-                                cl_uint(viennacl::traits::start(vec)),
-                                cl_uint(viennacl::traits::stride(vec)),
-                                cl_uint(viennacl::traits::size(vec)), 
-                               alpha) 
-                            );
-=======
       assert( vec1.size() == vec2.size() && bool("Size mismatch") );
 
       switch (viennacl::traits::handle(vec1).get_active_handle_id())
@@ -724,7 +446,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
     // vector expression on lhs
@@ -733,38 +454,8 @@ namespace viennacl
                         vector_base<T> const & vec2,
                         T & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in mul_add()!");
-      
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mul_add");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)),
-                               alpha,
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)),
-                               viennacl::traits::handle(result),
-                                cl_uint(viennacl::traits::start(result)),
-                                cl_uint(viennacl::traits::stride(result)),
-                                cl_uint(viennacl::traits::size(result)))
-                            );        
-=======
       viennacl::vector<T> temp = vec1;
       inner_prod_cpu(temp, vec2, result);
->>>>>>> upstream/1.5.1
     }
 
 
@@ -774,37 +465,8 @@ namespace viennacl
                         viennacl::vector_expression<LHS, RHS, OP> const & vec2,
                         T & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in mul_add()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_mul_add");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)), 
-                               static_cast<value_type>(alpha),
-                               viennacl::traits::handle(vec2), 
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)), 
-                               viennacl::traits::handle(result),
-                                cl_uint(viennacl::traits::start(result)),
-                                cl_uint(viennacl::traits::stride(result)),
-                                cl_uint(viennacl::traits::size(result)))
-                            );
-=======
       viennacl::vector<T> temp = vec2;
       inner_prod_cpu(vec1, temp, result);
->>>>>>> upstream/1.5.1
     }
 
 
@@ -815,33 +477,9 @@ namespace viennacl
                         viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
                         S3 & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_mul_add()!");
-      
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mul_add");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)), 
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)), 
-                               alpha));
-=======
       viennacl::vector<S3> temp1 = vec1;
       viennacl::vector<S3> temp2 = vec2;
       inner_prod_cpu(temp1, temp2, result);
->>>>>>> upstream/1.5.1
     }
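
The inner_prod_impl()/inner_prod_cpu() overloads above are likewise internal; library users call viennacl::linalg::inner_prod(vec1, vec2), as the in-tree documentation states. A minimal sketch, with the include path assumed from the upstream header layout:

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/inner_prod.hpp"

    void inner_prod_example()
    {
      viennacl::vector<float> v1(1000), v2(1000);

      // ... fill v1 and v2 ...

      float ip = viennacl::linalg::inner_prod(v1, v2);  // reduction runs on the active backend
    }
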
 
 
@@ -857,28 +495,6 @@ namespace viennacl
                          vector_tuple<T> const & y_tuple,
                          vector_base<T> & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_mul_add()!");
-
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_inplace_mul_add");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)), 
-                                cl_uint(viennacl::traits::size(vec1)), 
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)), 
-                                cl_uint(viennacl::traits::size(vec2)), 
-                               value_type(alpha)));
-=======
       assert( x.size() == y_tuple.const_at(0).size() && bool("Size mismatch") );
       assert( result.size() == y_tuple.const_size() && bool("Number of elements does not match result size") );
 
@@ -902,7 +518,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
 
@@ -915,34 +530,6 @@ namespace viennacl
     void norm_1_impl(vector_base<T> const & vec,
                      scalar<T> & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in mul_sub()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mul_sub");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)), 
-                               alpha,
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)), 
-                               viennacl::traits::handle(result),
-                                cl_uint(viennacl::traits::start(result)),
-                                cl_uint(viennacl::traits::stride(result)),
-                                cl_uint(viennacl::traits::size(result)))
-                            );
-=======
       switch (viennacl::traits::handle(vec).get_active_handle_id())
       {
         case viennacl::MAIN_MEMORY:
@@ -963,7 +550,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
 
@@ -976,32 +562,8 @@ namespace viennacl
     void norm_1_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
                      S2 & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_mul_sub()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mul_sub");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)), 
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)), 
-                               alpha)
-                            );        
-=======
       viennacl::vector<typename viennacl::result_of::cpu_value_type<S2>::type> temp = vec;
       norm_1_impl(temp, result);
->>>>>>> upstream/1.5.1
     }
 
 
@@ -1015,29 +577,6 @@ namespace viennacl
     void norm_1_cpu(vector_base<T> const & vec,
                     T & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_div_add()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_div_add");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)), 
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)), 
-                               alpha)
-                            );
-=======
       switch (viennacl::traits::handle(vec).get_active_handle_id())
       {
         case viennacl::MAIN_MEMORY:
@@ -1058,7 +597,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
     /** @brief Computes the l^1-norm of a vector with final reduction on the CPU - interface for a vector expression. Creates a temporary.
@@ -1070,110 +608,13 @@ namespace viennacl
     void norm_1_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
                     S2 & result)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_div_sub()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_div_sub");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)),
-                               alpha)
-                            );
-=======
       viennacl::vector<typename viennacl::result_of::cpu_value_type<LHS>::type> temp = vec;
       norm_1_cpu(temp, result);
->>>>>>> upstream/1.5.1
     }
 
 
 
 
-<<<<<<< HEAD
-    //implementation of inner product:
-    //namespace {
-    /** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
-     *
-     * @param vec1 The first vector
-     * @param vec2 The second vector
-     * @param result The result scalar (on the gpu)
-     * @param dummy  Dummy parameter used for SFINAE
-     */
-    template <typename V1, typename V2, typename S3>
-    void inner_prod_impl(V1 const & vec1,
-                         V2 const & vec2,
-                         S3 & result,
-                         typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                       && viennacl::is_vector<V2>::value
-                                                       && viennacl::is_scalar<S3>::value
-#ifdef _MSC_VER
-                                                     >::type * dummy = 0)
-#else
-                                                     >::type * dummy)
-#endif                                                   
-    {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-    
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inner_prod_impl()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inner_prod");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-      unsigned int work_groups = k.global_work_size() / k.local_work_size();
-      
-      static viennacl::vector<value_type> temp(work_groups);
-      
-      //Note: Number of work groups MUST be a power of two!
-      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
-      assert( work_groups * k.local_work_size() == k.global_work_size() );
-      assert( (k.global_work_size() / k.local_work_size()) == 1 
-              || (k.global_work_size() / k.local_work_size()) == 2 
-              || (k.global_work_size() / k.local_work_size()) == 4
-              || (k.global_work_size() / k.local_work_size()) == 8
-              || (k.global_work_size() / k.local_work_size()) == 16
-              || (k.global_work_size() / k.local_work_size()) == 32
-              || (k.global_work_size() / k.local_work_size()) == 64
-              || (k.global_work_size() / k.local_work_size()) == 128
-              || (k.global_work_size() / k.local_work_size()) == 256
-              || (k.global_work_size() / k.local_work_size()) == 512 );
-              
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),
-                                cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),
-                                cl_uint(viennacl::traits::size(vec2)),
-                               viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
-                               temp));        
-
-      viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sum");
-      
-      ksum.local_work_size(0, work_groups);
-      ksum.global_work_size(0, work_groups);
-      viennacl::ocl::enqueue(ksum(viennacl::traits::handle(temp),
-                                  cl_uint(viennacl::traits::start(temp)),
-                                  cl_uint(viennacl::traits::stride(temp)),
-                                  cl_uint(viennacl::traits::size(temp)),
-                                  result)
-                            );
-=======
     /** @brief Computes the l^2-norm of a vector - dispatcher interface
     *
     * @param vec The vector
@@ -1203,7 +644,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
 
     /** @brief Computes the l^2-norm of a vector - interface for a vector expression. Creates a temporary.
@@ -1224,7 +664,6 @@ namespace viennacl
     *
     * @param vec The vector
     * @param result The result scalar
-    * @param dummy  Dummy parameter used for SFINAE
     */
     template <typename T>
     void norm_2_cpu(vector_base<T> const & vec,
@@ -1250,44 +689,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
-<<<<<<< HEAD
-      
-      unsigned int work_groups = k.global_work_size() / k.local_work_size();
-      viennacl::vector<value_type> temp(work_groups);
-
-      //Note: Number of work groups MUST be a power of two!
-      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
-      assert( work_groups * k.local_work_size() == k.global_work_size() );
-      assert( (k.global_work_size() / k.local_work_size()) == 1 
-             || (k.global_work_size() / k.local_work_size()) == 2 
-             || (k.global_work_size() / k.local_work_size()) == 4
-             || (k.global_work_size() / k.local_work_size()) == 8
-             || (k.global_work_size() / k.local_work_size()) == 16
-             || (k.global_work_size() / k.local_work_size()) == 32
-             || (k.global_work_size() / k.local_work_size()) == 64
-             || (k.global_work_size() / k.local_work_size()) == 128
-             || (k.global_work_size() / k.local_work_size()) == 256
-             || (k.global_work_size() / k.local_work_size()) == 512 );
-               
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
-                                cl_uint(viennacl::traits::start(vec)),
-                                cl_uint(viennacl::traits::stride(vec)),
-                                cl_uint(viennacl::traits::size(vec)),                                 
-                                viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
-                                temp));        
-      
-      viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sum");
-      
-      ksum.local_work_size(0, work_groups);
-      ksum.global_work_size(0, work_groups);
-      viennacl::ocl::enqueue(ksum(viennacl::traits::handle(temp),
-                                  cl_uint(viennacl::traits::start(temp)),
-                                  cl_uint(viennacl::traits::stride(temp)),
-                                  cl_uint(viennacl::traits::size(temp)),
-                                  result)
-                            );
-=======
->>>>>>> upstream/1.5.1
     }
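
The legacy OpenCL path deleted in this hunk computed norm_2 in two stages: a first kernel wrote one partial sum of squares per work group into a temporary buffer (asserting that the number of work groups is a power of two), and a second "sum"/"sqrt_sum" kernel reduced those partials. A host-side model of that scheme, purely for illustration (the grouping below imitates the idea, not the actual kernel code):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    double norm_2_two_stage(const std::vector<double>& x, std::size_t work_groups)
    {
      std::vector<double> partial(work_groups, 0.0);   // the "temp" buffer above
      for (std::size_t i = 0; i < x.size(); ++i)       // stage 1: per-group partial sums
        partial[i % work_groups] += x[i] * x[i];

      double sum = 0.0;                                // stage 2: final reduction
      for (std::size_t g = 0; g < work_groups; ++g)
        sum += partial[g];
      return std::sqrt(sum);
    }
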
 
     /** @brief Computes the l^2-norm of a vector with final reduction on the CPU - interface for a vector expression. Creates a temporary.
@@ -1310,7 +711,6 @@ namespace viennacl
     *
     * @param vec The vector
     * @param result The result scalar
-    * @param dummy  Dummy parameter used for SFINAE
     */
     template <typename T>
     void norm_inf_impl(vector_base<T> const & vec,
@@ -1338,44 +738,6 @@ namespace viennacl
       }
     }
 
-<<<<<<< HEAD
-      unsigned int work_groups = k.global_work_size() / k.local_work_size();
-      viennacl::vector<value_type> temp(work_groups);
-        
-      //Note: Number of work groups MUST be a power of two!
-      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
-      assert( work_groups * k.local_work_size() == k.global_work_size() );
-      assert( (k.global_work_size() / k.local_work_size()) == 1 
-             || (k.global_work_size() / k.local_work_size()) == 2 
-             || (k.global_work_size() / k.local_work_size()) == 4
-             || (k.global_work_size() / k.local_work_size()) == 8
-             || (k.global_work_size() / k.local_work_size()) == 16
-             || (k.global_work_size() / k.local_work_size()) == 32
-             || (k.global_work_size() / k.local_work_size()) == 64
-             || (k.global_work_size() / k.local_work_size()) == 128
-             || (k.global_work_size() / k.local_work_size()) == 256
-             || (k.global_work_size() / k.local_work_size()) == 512 );
-               
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
-                                  cl_uint(viennacl::traits::start(vec)),
-                                  cl_uint(viennacl::traits::stride(vec)),
-                                  cl_uint(viennacl::traits::size(vec)),                                 
-                                 viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
-                                 temp)
-                              );
-
-        viennacl::ocl::kernel & sqrt_sum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sqrt_sum");
-        
-        sqrt_sum.local_work_size(0, work_groups);
-        sqrt_sum.global_work_size(0, work_groups);
-        viennacl::ocl::enqueue(
-                        sqrt_sum(viennacl::traits::handle(temp),
-                                  cl_uint(viennacl::traits::start(temp)),
-                                  cl_uint(viennacl::traits::stride(temp)),
-                                  cl_uint(viennacl::traits::size(temp)),
-                                 result)
-                              );
-=======
     /** @brief Computes the supremum norm of a vector - interface for a vector expression. Creates a temporary.
     *
     * @param vec    The vector expression
@@ -1387,7 +749,6 @@ namespace viennacl
     {
       viennacl::vector<T> temp = vec;
       norm_inf_impl(temp, result);
->>>>>>> upstream/1.5.1
     }
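
For reference, norm_inf_impl as dispatched above computes the supremum norm, i.e. the largest absolute entry of the vector. A plain host-side equivalent (sketch only, not the library implementation):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    double norm_inf_host(const std::vector<double>& x)
    {
      double max_abs = 0.0;
      for (std::size_t i = 0; i < x.size(); ++i)
        max_abs = std::max(max_abs, std::fabs(x[i]));
      return max_abs;
    }
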
 
 
@@ -1395,7 +756,6 @@ namespace viennacl
     *
     * @param vec The vector
     * @param result The result scalar
-    * @param dummy  Dummy parameter used for SFINAE
     */
     template <typename T>
     void norm_inf_cpu(vector_base<T> const & vec,
@@ -1421,47 +781,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
-<<<<<<< HEAD
-      
-      unsigned int work_groups = k.global_work_size() / k.local_work_size();
-      viennacl::vector<value_type> temp(work_groups);
-        
-      //Note: Number of work groups MUST be a power of two!
-      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
-      assert( work_groups * k.local_work_size() == k.global_work_size() );
-      assert( work_groups == 1 
-             || work_groups == 2 
-             || work_groups == 4
-             || work_groups == 8
-             || work_groups == 16
-             || work_groups == 32
-             || work_groups == 64
-             || work_groups == 128
-             || work_groups == 256
-             || work_groups == 512 );
-               
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
-                                cl_uint(viennacl::traits::start(vec)),
-                                cl_uint(viennacl::traits::stride(vec)),
-                                cl_uint(viennacl::traits::size(vec)),                                 
-                               viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
-                               temp));
-      //viennacl::ocl::get_queue().finish();
-      
-      //part 2: parallel reduction of reduced kernel:
-      viennacl::ocl::kernel & max_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "vmax");
-      max_kernel.local_work_size(0, work_groups);
-      max_kernel.global_work_size(0, work_groups);
-      
-      viennacl::ocl::enqueue(
-                       max_kernel(viennacl::traits::handle(temp),
-                                   cl_uint(viennacl::traits::start(temp)),
-                                   cl_uint(viennacl::traits::stride(temp)),
-                                   cl_uint(viennacl::traits::size(temp)),
-                                  result)
-                            );
-=======
->>>>>>> upstream/1.5.1
     }
 
     /** @brief Computes the supremum norm of a vector with final reduction on the CPU - interface for a vector expression. Creates a temporary.
@@ -1489,32 +808,6 @@ namespace viennacl
     template <typename T>
     vcl_size_t index_norm_inf(vector_base<T> const & vec)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      viennacl::ocl::handle<cl_mem> h = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint));
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "index_norm_inf");
-      //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
-
-      k.global_work_size(0, k.local_work_size());
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec),
-                                cl_uint(viennacl::traits::start(vec)),
-                                cl_uint(viennacl::traits::stride(vec)),
-                                cl_uint(viennacl::traits::size(vec)),                                 
-                               viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
-                               viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h));
-      
-      //read value:
-      cl_uint result;
-      cl_int err;
-      err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), h.get(), CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
-      VIENNACL_ERR_CHECK(err);
-      return result;
-=======
       switch (viennacl::traits::handle(vec).get_active_handle_id())
       {
         case viennacl::MAIN_MEMORY:
@@ -1532,7 +825,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
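
index_norm_inf returns the position of that largest absolute entry; the deleted OpenCL branch ran the search on the device and read a single cl_uint back with a blocking clEnqueueReadBuffer. The host-side meaning of the operation (illustrative sketch):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    std::size_t index_norm_inf_host(const std::vector<double>& x)
    {
      std::size_t best = 0;                       // assumes a non-empty vector
      for (std::size_t i = 1; i < x.size(); ++i)
        if (std::fabs(x[i]) > std::fabs(x[best]))
          best = i;
      return best;
    }
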
 
     /** @brief Computes the supremum norm of a vector with final reduction on the CPU - interface for a vector expression. Creates a temporary.
@@ -1561,27 +853,6 @@ namespace viennacl
                         vector_base<T> & vec2,
                         T alpha, T beta)
     {
-<<<<<<< HEAD
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2));
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "plane_rotation");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                                cl_uint(viennacl::traits::start(vec1)),
-                                cl_uint(viennacl::traits::stride(vec1)),                                 
-                                cl_uint(viennacl::traits::size(vec1)),                                 
-                               viennacl::traits::handle(vec2),
-                                cl_uint(viennacl::traits::start(vec2)),
-                                cl_uint(viennacl::traits::stride(vec2)),                                 
-                                cl_uint(viennacl::traits::size(vec2)),                                 
-                               alpha,
-                               beta)
-                            );
-=======
       switch (viennacl::traits::handle(vec1).get_active_handle_id())
       {
         case viennacl::MAIN_MEMORY:
@@ -1602,7 +873,6 @@ namespace viennacl
         default:
           throw memory_exception("not implemented");
       }
->>>>>>> upstream/1.5.1
     }
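
plane_rotation applies a Givens rotation to a pair of equally sized vectors; assuming the usual BLAS rot convention (alpha playing the role of c, beta of s), the dispatched backends compute (x_i, y_i) <- (alpha*x_i + beta*y_i, -beta*x_i + alpha*y_i). A host-side sketch of that update:

    #include <cstddef>
    #include <vector>

    void plane_rotation_host(std::vector<double>& x, std::vector<double>& y,
                             double alpha, double beta)
    {
      for (std::size_t i = 0; i < x.size(); ++i)   // x and y assumed equally sized
      {
        const double xi = x[i];
        const double yi = y[i];
        x[i] =  alpha * xi + beta * yi;
        y[i] = -beta  * xi + alpha * yi;
      }
    }
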
 
   } //namespace linalg
diff --git a/viennacl/matrix.hpp b/viennacl/matrix.hpp
index bd21d6a..9ac27cb 100644
--- a/viennacl/matrix.hpp
+++ b/viennacl/matrix.hpp
@@ -1,1083 +1,3 @@
-<<<<<<< HEAD
-#ifndef VIENNACL_MATRIX_HPP_
-#define VIENNACL_MATRIX_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file matrix.hpp
-    @brief Implementation of the dense matrix class
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/matrix_operations.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/tools/matrix_size_deducer.hpp"
-#include "viennacl/tools/matrix_kernel_class_deducer.hpp"
-#include "viennacl/meta/result_of.hpp"
-#include "viennacl/meta/enable_if.hpp"
-
-namespace viennacl
-{
-    /** @brief A tag for row-major storage of a dense matrix. */
-    struct row_major
-    {
-      /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
-      *
-      * @param i   row index
-      * @param j   column index
-      * @param num_rows  number of rows (internal size, including alignment padding)
-      * @param num_cols  number of columns (internal size, including alignment padding)
-      */
-      static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t num_rows, vcl_size_t num_cols)
-      {
-        return i * num_cols + j;
-      }
-      
-      static vcl_size_t internal_size1(vcl_size_t rows, vcl_size_t alignment)
-      {
-        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(rows, alignment);
-      }
-      
-      static vcl_size_t internal_size2(vcl_size_t cols, vcl_size_t alignment)
-      {
-        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(cols, alignment);
-      }
-    };
-
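-    /** @brief A tag for column-major storage of a dense matrix. */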
-    struct column_major
-    {
-      /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
-      *
-      * @param i   row index
-      * @param j   column index
-      * @param num_rows  number of rows (internal size, including alignment padding)
-      * @param num_cols  number of columns (internal size, including alignment padding)
-      */
-      static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t num_rows, vcl_size_t num_cols)
-      {
-        return i + j * num_rows;
-      }
-      
-      static vcl_size_t internal_size1(vcl_size_t rows, vcl_size_t alignment)
-      {
-        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(rows, alignment);
-      }
-      
-      static vcl_size_t internal_size2(vcl_size_t cols, vcl_size_t alignment)
-      {
-        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(cols, alignment);
-      }
-    };
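
The two layout tags differ only in how mem_index maps (i, j) into the padded linear buffer: row_major uses i * num_cols + j, column_major uses i + j * num_rows, where num_rows/num_cols are the internal (alignment-padded) extents. A self-contained check of that mapping (the concrete sizes below are made up for illustration):

    #include <cassert>
    #include <cstddef>

    std::size_t row_major_index(std::size_t i, std::size_t j,
                                std::size_t /*num_rows*/, std::size_t num_cols)
    { return i * num_cols + j; }

    std::size_t column_major_index(std::size_t i, std::size_t j,
                                   std::size_t num_rows, std::size_t /*num_cols*/)
    { return i + j * num_rows; }

    int main()
    {
      // a 2x3 logical matrix held in a 4x4 padded buffer (ALIGNMENT == 4)
      assert(row_major_index(1, 2, 4, 4) == 6);     // 1*4 + 2
      assert(column_major_index(1, 2, 4, 4) == 9);  // 1 + 2*4
      return 0;
    }
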
-    
-    template <typename LHS, typename RHS, typename OP>
-    class matrix_expression
-    {
-      public:
-        ///** @brief Extracts the vector type from the two operands.
-        //*/
-        //typedef typename viennacl::tools::VECTOR_EXTRACTOR<LHS, RHS>::ResultType    VectorType;
-      
-        matrix_expression(LHS & lhs, RHS & rhs) : _lhs(lhs), _rhs(rhs) {}
-        
-        /** @brief Get left hand side operand
-        */
-        LHS & lhs() const { return _lhs; }
-        /** @brief Get right hand side operand
-        */
-        RHS & rhs() const { return _rhs; }
-        
-        /** @brief Returns the row and column sizes of the resulting matrix */
-        std::size_t size1() const { return viennacl::tools::MATRIX_SIZE_DEDUCER<LHS, RHS, OP>::size1(_lhs, _rhs); }
-        std::size_t size2() const { return viennacl::tools::MATRIX_SIZE_DEDUCER<LHS, RHS, OP>::size2(_lhs, _rhs); }
-        
-      private:
-        /** @brief The left hand side operand */
-        typename result_of::matrix_expression_internal_storage<LHS>::type _lhs;
-        /** @brief The right hand side operand */
-        typename result_of::matrix_expression_internal_storage<RHS>::type _rhs;
-    };
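
matrix_expression is the usual expression-template proxy: it only stores its two operands and deduces the result sizes; no arithmetic happens until the proxy is assigned to a matrix, which routes into viennacl::linalg::add/sub/prod_impl further below. A stripped-down illustration of the same idea with made-up names:

    #include <cstddef>

    template <typename LHS, typename RHS>
    struct lazy_sum
    {
      lazy_sum(const LHS& l, const RHS& r) : lhs(l), rhs(r) {}
      std::size_t size1() const { return lhs.size1(); }  // sizes come from the operands
      std::size_t size2() const { return lhs.size2(); }
      const LHS& lhs;   // nothing is evaluated here ...
      const RHS& rhs;   // ... evaluation happens on assignment to a concrete matrix
    };

    template <typename LHS, typename RHS>
    lazy_sum<LHS, RHS> make_lazy_sum(const LHS& l, const RHS& r)
    { return lazy_sum<LHS, RHS>(l, r); }
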
-    
-    
-    /** @brief A tag indicating iteration along increasing row index of a matrix */
-    struct row_iteration {};
-    
-    /** @brief A tag indicating iteration along increasing column index of a matrix */
-    struct col_iteration {};
-
-    //STL-like iterator. TODO: STL-compliance...
-    template <typename ROWCOL, typename MATRIXTYPE>
-    class matrix_iterator
-    {
-        typedef matrix_iterator<ROWCOL, MATRIXTYPE>    self_type;
-      public:
-        typedef typename MATRIXTYPE::value_type       value_type;
-        
-        matrix_iterator(MATRIXTYPE & mat, 
-                        std::size_t start_row,
-                        std::size_t start_col) : mat_(mat), row_(start_row), col_(start_col) {};
-        
-        value_type operator*(void) { return mat_(row_, col_); }
-        self_type & operator++(void) { viennacl::tools::MATRIX_ITERATOR_INCREMENTER<ROWCOL, MATRIXTYPE>::apply(mat_, row_, col_); return *this; }
-        self_type & operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
-        
-        bool operator==(self_type const & other) { return (row_ == other.row_) && (col_ == other.col_); }
-        bool operator!=(self_type const & other) { return !(*this == other); }
-        
-        vcl_size_t index1() { return row_; }
-        vcl_size_t index2() { return col_; }
-        
-        MATRIXTYPE & operator()(void) const { return mat_; }
-      
-      private:
-        MATRIXTYPE & mat_;
-        vcl_size_t row_;
-        vcl_size_t col_;
-    };
-
-    /** @brief A dense matrix class
-    *
-    * @tparam SCALARTYPE   The underlying scalar type (either float or double)
-    * @tparam F            Storage layout: Either row_major or column_major (at present only row_major is supported)
-    * @tparam ALIGNMENT   The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
-    */
-    template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    class matrix
-    {
-      typedef matrix<SCALARTYPE, F, ALIGNMENT>          self_type;
-    public:
-      
-      typedef matrix_iterator<row_iteration, matrix<SCALARTYPE, F, ALIGNMENT> >   iterator1;
-      typedef matrix_iterator<col_iteration, matrix<SCALARTYPE, F, ALIGNMENT> >   iterator2;
-      typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
-      typedef vcl_size_t                                                          size_type;
-      
-      /** @brief The default constructor. Does not allocate any memory. */
-      matrix() : rows_(0), columns_(0)
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-      };
-      
-      /** @brief Creates the matrix with the given dimensions
-      *
-      * @param rows     Number of rows
-      * @param columns  Number of columns
-      */
-      explicit matrix(size_type rows, size_type columns) :
-        rows_(rows), columns_(columns)
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-      }
-
-      explicit matrix(cl_mem mem, size_type rows, size_type columns) :
-        rows_(rows), columns_(columns)
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = mem;
-        elements_.inc(); //prevents that the user-provided memory is deleted once the matrix object is destroyed.
-      }
-
-      template <typename LHS, typename RHS, typename OP>
-      matrix(matrix_expression< LHS, RHS, OP> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-        
-        *this = proxy;
-      }
-      
-      // matrix_range
-
-      matrix(matrix_range<self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-        
-        *this = proxy;
-      }
-
-      matrix(matrix_range<const self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-        
-        *this = proxy;
-      }
-      
-      // matrix_slice
-
-      matrix(matrix_slice<self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-        
-        *this = proxy;
-      }
-
-      matrix(matrix_slice<const self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-        
-        *this = proxy;
-      }
-
-
-      //copy constructor:
-      matrix(const matrix<SCALARTYPE, F, ALIGNMENT> & mat) :
-        rows_(mat.size1()), columns_(mat.size2()),
-        elements_(viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size()))
-      {
-        cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), mat.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix<SCALARTYPE, F, ALIGNMENT> & mat)
-      {
-        resize(mat.size1(), mat.size2(), false);
-        cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), mat.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        return *this;
-      }
-      
-      
-      // A = trans(B). Currently achieved in CPU memory
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                            const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                            op_trans> & proxy)
-      {
-        assert(elements_.get() != proxy.lhs().handle().get() && "Self-assignment of matrix transpose not implemented");
-        assert(proxy.lhs().size1() == size2() && "Matrix dimensions do not match!");
-        assert(proxy.lhs().size2() == size1() && "Matrix dimensions do not match!");
-
-        resize(proxy.lhs().size2(), proxy.lhs().size1(), false);
-        
-        std::vector<SCALARTYPE> temp(proxy.lhs().internal_size());
-        
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         proxy.lhs().handle().get(), CL_TRUE, 0,
-                                         sizeof(SCALARTYPE)*proxy.lhs().internal_size(),
-                                         &(temp[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        viennacl::ocl::get_queue().finish();
-
-        // now transpose it
-        std::vector<SCALARTYPE> temp_trans(internal_size());
-
-        for (vcl_size_t i=0; i<proxy.lhs().size1(); ++i)
-          for (vcl_size_t j=0; j<proxy.lhs().size2(); ++j)
-            temp_trans[F::mem_index(j,i, internal_size1(), internal_size2())] 
-             = temp[F::mem_index(i,j, proxy.lhs().internal_size1(), proxy.lhs().internal_size2())];
-
-        // write back
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                   sizeof(SCALARTYPE)*internal_size(),
-                                                                   &(temp_trans[0]));
-          
-        return *this;
-      }
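
As the comment notes, A = trans(B) is realized by reading B into host memory, permuting entries through the layout's mem_index, and uploading a fresh buffer. The core of that permutation for an unpadded row-major buffer (sketch; the real code also accounts for alignment padding):

    #include <cstddef>
    #include <vector>

    std::vector<double> transpose_row_major(const std::vector<double>& src,
                                            std::size_t rows, std::size_t cols)
    {
      std::vector<double> dst(src.size());
      for (std::size_t i = 0; i < rows; ++i)
        for (std::size_t j = 0; j < cols; ++j)
          dst[j * rows + i] = src[i * cols + j];   // dst is a cols x rows matrix
      return dst;
    }
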
-
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_range<self_type> & mat)
-      {
-        resize(mat.size1(), mat.size2(), false);
-        viennacl::linalg::assign(*this, mat);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_range<const self_type> & mat)
-      {
-        resize(mat.size1(), mat.size2(), false);
-        viennacl::linalg::assign(*this, mat);
-        return *this;
-      }
-
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_slice<self_type> & mat)
-      {
-        resize(mat.size1(), mat.size2(), false);
-        viennacl::linalg::assign(*this, mat);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_slice<const self_type> & mat)
-      {
-        resize(mat.size1(), mat.size2(), false);
-        viennacl::linalg::assign(*this, mat);
-        return *this;
-      }
-
-
-      /** @brief Resizes the matrix.
-      *   Existing entries can be preserved, but doing so requires copying them through a temporary host buffer.
-      *
-      * @param rows       New number of rows
-      * @param columns    New number of columns
-      * @param preserve   If true, existing values are preserved. 
-      */
-      void resize(size_type rows, size_type columns, bool preserve = true)
-      {
-        assert(rows > 0 && columns > 0);
-        if (preserve)
-        {
-          //get old entries:
-          std::vector< SCALARTYPE > old_entries(internal_size());
-          cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), //src
-                                           elements_.get(), //dest
-                                           CL_TRUE, //blocking
-                                           0, //offset
-                                           sizeof(SCALARTYPE)*internal_size(), //size
-                                           &(old_entries[0]), //destination
-                                           0, NULL, NULL);
-          VIENNACL_ERR_CHECK(err);
-          
-          //set up entries of new matrix:
-          std::vector< SCALARTYPE > new_entries(F::internal_size1(rows, ALIGNMENT) * F::internal_size2(columns, ALIGNMENT));
-          for (size_type i=0; i<rows; ++i)
-          {
-            if (i >= rows_)
-              continue;
-              
-            for (size_type j=0; j<columns; ++j)
-            {
-              if (j >= columns_)
-                continue;
-              new_entries[F::mem_index(i, j, F::internal_size1(rows, ALIGNMENT), F::internal_size2(columns, ALIGNMENT))] 
-                 = old_entries[F::mem_index(i, j, internal_size1(), internal_size2())];
-            }
-          }
-          
-          //copy new entries to GPU:
-          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, new_entries);
-          rows_ = rows;
-          columns_ = columns;
-        }
-        else //discard old entries:
-        {
-          rows_ = rows;
-          columns_ = columns;
-          
-          std::vector< SCALARTYPE > new_entries(F::internal_size1(rows, ALIGNMENT) * F::internal_size2(columns, ALIGNMENT));
-          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, new_entries);
-        }
-      }
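
resize(rows, columns, true) therefore round-trips through the host: the old padded buffer is read back, the overlapping top-left block is copied into a freshly sized buffer, and that buffer replaces the device memory. The copy step, sketched without padding:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    std::vector<double> resize_preserve(const std::vector<double>& old_data,
                                        std::size_t old_rows, std::size_t old_cols,
                                        std::size_t new_rows, std::size_t new_cols)
    {
      std::vector<double> new_data(new_rows * new_cols, 0.0);
      for (std::size_t i = 0; i < std::min(old_rows, new_rows); ++i)
        for (std::size_t j = 0; j < std::min(old_cols, new_cols); ++j)
          new_data[i * new_cols + j] = old_data[i * old_cols + j];   // row-major
      return new_data;
    }
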
-      
-      
-      //read-write access to an element of the matrix
-      /** @brief Read-write access to a single element of the matrix
-      */
-      entry_proxy<SCALARTYPE> operator()(size_type row_index, size_type col_index)
-      {
-        return entry_proxy<SCALARTYPE>(F::mem_index(row_index, col_index, internal_size1(), internal_size2()), elements_);
-      }
-      
-      /** @brief Read access to a single element of the matrix
-      */
-      scalar<SCALARTYPE> operator()(size_type row_index, size_type col_index) const
-      {
-        scalar<SCALARTYPE> tmp;
-        cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
-                                  elements_.get(),
-                                  tmp.handle().get(),
-                                  sizeof(SCALARTYPE) * F::mem_index(row_index, col_index, internal_size1(), internal_size2()),
-                                  0,
-                                  sizeof(SCALARTYPE),
-                                  0,
-                                  NULL,
-                                  NULL);
-        //assert(err == CL_SUCCESS);
-        VIENNACL_ERR_CHECK(err);
-        return tmp;
-      }
-      
-
-      matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                         const matrix<SCALARTYPE, F, ALIGNMENT>,
-                         op_add >
-      operator + (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
-      {
-        return matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                  const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                  op_add > (*this, other);
-      }
-
-      // operator +=
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix_range< matrix<SCALARTYPE, F, ALIGNMENT> > & other) 
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix_slice< matrix<SCALARTYPE, F, ALIGNMENT> > & other) 
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-
-      template <unsigned int A1, unsigned int A2>
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix_expression< const vector<SCALARTYPE, A1>,
-                                                                               const vector<SCALARTYPE, A2>,
-                                                                               op_prod > & proxy) 
-      {
-        viennacl::linalg::rank_1_update(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      template <unsigned int A1, unsigned int A2>
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix_expression< const matrix_expression< const vector<SCALARTYPE, A1>,
-                                                                                                        const vector<SCALARTYPE, A2>,
-                                                                                                        op_prod >,
-                                                                               const SCALARTYPE,
-                                                                               op_prod > & proxy) 
-      {
-        viennacl::linalg::scaled_rank_1_update(*this, proxy.rhs(), proxy.lhs().lhs(), proxy.lhs().rhs());
-        return *this;
-      }
-      
-      // operator -
-      matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                         const matrix<SCALARTYPE, F, ALIGNMENT>,
-                         op_sub >
-      operator - (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
-      {
-        return matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                  const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                  op_sub > (*this, other);
-      }
-      
-      // operator -=
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
-      {
-        viennacl::linalg::inplace_sub(*this, other);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix_range< matrix<SCALARTYPE, F, ALIGNMENT> > & other) 
-      {
-        viennacl::linalg::inplace_sub(*this, other);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix_slice< matrix<SCALARTYPE, F, ALIGNMENT> > & other) 
-      {
-        viennacl::linalg::inplace_sub(*this, other);
-        return *this;
-      }
-
-      template <unsigned int A1, unsigned int A2>
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix_expression< const vector<SCALARTYPE, A1>,
-                                                                               const vector<SCALARTYPE, A2>,
-                                                                               op_prod > & proxy) 
-      {
-        viennacl::linalg::scaled_rank_1_update(*this, static_cast<SCALARTYPE>(-1.0), proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-
-      template <unsigned int A1, unsigned int A2>
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix_expression< const matrix_expression< const vector<SCALARTYPE, A1>,
-                                                                                                        const vector<SCALARTYPE, A2>,
-                                                                                                        op_prod >,
-                                                                               const SCALARTYPE,
-                                                                               op_prod > & proxy) 
-      {
-        viennacl::linalg::scaled_rank_1_update(*this, static_cast<SCALARTYPE>(-1.0) * proxy.rhs(), proxy.lhs().lhs(), proxy.lhs().rhs());
-        return *this;
-      }
-      
-      
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator *= (SCALARTYPE val) 
-      {
-        viennacl::linalg::inplace_mult(*this, val);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator *= (scalar<SCALARTYPE> const & val) 
-      {
-        viennacl::linalg::inplace_mult(*this, val);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator /= (SCALARTYPE val) 
-      {
-        viennacl::linalg::inplace_mult(*this, SCALARTYPE(1.0) / val);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator /= (scalar<SCALARTYPE> const & val) 
-      {
-        viennacl::linalg::inplace_divide(*this, val);
-        return *this;
-      }
-
-
-      //this = A * B and related (with trans())
-      template <typename MatrixType1, typename MatrixType2>
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator = (const matrix_expression< MatrixType1,
-                                                                              MatrixType2,
-                                                                              op_prod > & proxy) 
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-      //this = A + B
-      template <typename T1, typename T2>
-      matrix<SCALARTYPE, F, ALIGNMENT> &
-      operator = (const matrix_expression< const T1,
-                                           const T2,
-                                           op_add > & proxy) 
-      {
-        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      
-      //this = A - B
-      template <typename T1, typename T2>
-      matrix<SCALARTYPE, F, ALIGNMENT> &
-      operator = (const matrix_expression< const T1,
-                                           const T2,
-                                           op_sub > & proxy) 
-      {
-        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      
-      
-      
-
-      //this = A - B
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator = (const matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                               const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                               op_sub > & proxy) 
-      {
-        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-
-      /** @brief Returns the number of rows */
-      const size_type & size1() const { return rows_;}
-      /** @brief Returns the number of columns */
-      const size_type & size2() const { return columns_; }
-      
-      /** @brief Resets all entries to zero */
-      void clear()
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "clear");
-        viennacl::ocl::enqueue(k(elements_,
-                                 cl_uint(0), cl_uint(0),
-                                 cl_uint(1), cl_uint(1),
-                                 cl_uint(size1()), cl_uint(size2()),
-                                 cl_uint(internal_size1()), cl_uint(internal_size2())
-                                )
-                              );
-      }
-      
-      
-      //const unsigned int row_stride() const { return roundUpToNextMultiple<unsigned int>(columns(), ALIGNMENT); }
-      /** @brief Returns the internal number of rows. Usually required for launching OpenCL kernels only */
-      const size_type internal_size1() const { return F::internal_size1(size1(), ALIGNMENT); }
-      /** @brief Returns the internal number of columns. Usually required for launching OpenCL kernels only */
-      const size_type internal_size2() const { return F::internal_size2(size2(), ALIGNMENT); }
-      /** @brief Returns the total amount of allocated memory in multiples of sizeof(SCALARTYPE) */
-      const size_type internal_size() const { return internal_size1() * internal_size2(); }
-      
-      /** @brief Returns the OpenCL handle */
-      const viennacl::ocl::handle<cl_mem> & handle() const { return elements_; }
-      
-      #if defined(_MSC_VER) && _MSC_VER < 1500          //Visual Studio 2005 needs special treatment
-      template <typename CPU_MATRIX>
-      friend void copy(const CPU_MATRIX & cpu_matrix,
-                      matrix & gpu_matrix );
-      
-      template <typename SCALARTYPE2, typename A1, typename A2>
-      friend void copy(const std::vector< std::vector<SCALARTYPE2, A1>, A2> & cpu_matrix,
-                      matrix & gpu_matrix );
-      
-      template <typename SCALARTYPE2>
-      friend void fast_copy(SCALARTYPE2 * cpu_matrix_begin,
-                            SCALARTYPE2 * cpu_matrix_end,
-                            matrix & gpu_matrix);
-      
-      #ifdef VIENNACL_HAVE_EIGEN
-      friend void copy(const Eigen::MatrixXf & cpu_matrix,
-                       matrix & gpu_matrix);
-      
-      friend void copy(const Eigen::MatrixXd & cpu_matrix,
-                       matrix & gpu_matrix);
-      #endif
-      
-      #ifdef VIENNACL_HAVE_MTL4
-      template <typename SCALARTYPE2, typename T>
-      friend void copy(const mtl::dense2D<SCALARTYPE2, T>& cpu_matrix,
-                       matrix & gpu_matrix);
-      #endif
-      #else
-      template <typename CPU_MATRIX, typename SCALARTYPE2, typename F2, unsigned int ALIGNMENT2>
-      friend void copy(const CPU_MATRIX & cpu_matrix,
-                      matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix );
-                      
-      template <typename SCALARTYPE2, typename A1, typename A2, typename F2, unsigned int ALIGNMENT2>
-      friend void copy(const std::vector< std::vector<SCALARTYPE2, A1>, A2> & cpu_matrix,
-                       matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix );
-      
-      template <typename SCALARTYPE2, typename F2, unsigned int ALIGNMENT2>
-      friend void fast_copy(SCALARTYPE2 * cpu_matrix_begin,
-                            SCALARTYPE2 * cpu_matrix_end,
-                            matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix);
-      
-      #ifdef VIENNACL_HAVE_EIGEN
-      template <typename F2, unsigned int ALIGNMENT2>
-      friend void copy(const Eigen::MatrixXf & cpu_matrix,
-                matrix<float, F2, ALIGNMENT2> & gpu_matrix);
-      
-      template <typename F2, unsigned int ALIGNMENT2>
-      friend void copy(const Eigen::MatrixXd & cpu_matrix,
-                matrix<double, F2, ALIGNMENT2> & gpu_matrix);
-      #endif
-      
-      #ifdef VIENNACL_HAVE_MTL4
-      template <typename SCALARTYPE2, typename T, typename F2, unsigned int ALIGNMENT2>
-      friend void copy(const mtl::dense2D<SCALARTYPE2, T>& cpu_matrix,
-                       matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix);
-      #endif
-      #endif                 
-      
-    private:
-      size_type rows_;
-      size_type columns_;
-      viennacl::ocl::handle<cl_mem> elements_;
-    }; //matrix
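
The distinction between size1()/size2() and internal_size1()/internal_size2() above is just alignment padding: each logical extent is rounded up to the next multiple of ALIGNMENT, and internal_size() is the product of the padded extents. An illustration of that arithmetic (the rounding rule below is the obvious one; tools::roundUpToNextMultiple is the authoritative version):

    #include <cassert>
    #include <cstddef>

    std::size_t round_up(std::size_t n, std::size_t alignment)
    { return ((n + alignment - 1) / alignment) * alignment; }

    int main()
    {
      const std::size_t rows = 5, cols = 3, alignment = 4;
      assert(round_up(rows, alignment) == 8);   // internal_size1()
      assert(round_up(cols, alignment) == 4);   // internal_size2()
      assert(round_up(rows, alignment) * round_up(cols, alignment) == 32);  // buffer entries
      return 0;
    }
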
-
-    /** @brief Prints the matrix. Output is compatible with boost::numeric::ublas
-    *
-    * @param s            STL output stream
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    std::ostream & operator<<(std::ostream & s, const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
-    {
-      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
-      
-      std::vector<SCALARTYPE> tmp(gpu_matrix.internal_size());
-      cl_int err;
-      err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE) * gpu_matrix.internal_size(), &tmp[0], 0, NULL, NULL);
-      VIENNACL_ERR_CHECK(err);
-      viennacl::ocl::get_queue().finish();
-      
-      s << "[" << gpu_matrix.size1() << "," << gpu_matrix.size2() << "]";
-      
-      s << "(";
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        s << "(";
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j)
-        {
-          s << tmp[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
-          if (j < gpu_matrix.size2() - 1)
-            s << ",";
-        }
-        s << ")";
-        if (i < gpu_matrix.size1() - 1)
-          s << ",";
-      }
-      s << ")";
-      return s;
-    }
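
The stream output follows the Boost.uBLAS convention: the dimensions in square brackets, then each row as a parenthesised, comma-separated tuple. For a 2x2 matrix holding {{1,2},{3,4}} the printed form is [2,2]((1,2),(3,4)); a tiny reproduction of that format:

    #include <iostream>

    int main()
    {
      const double m[2][2] = { {1, 2}, {3, 4} };
      std::cout << "[2,2](";
      for (int i = 0; i < 2; ++i)
      {
        std::cout << "(" << m[i][0] << "," << m[i][1] << ")";
        if (i < 1) std::cout << ",";
      }
      std::cout << ")" << std::endl;   // prints: [2,2]((1,2),(3,4))
      return 0;
    }
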
-
-    /** @brief Prints the matrix. Output is compatible with boost::numeric::ublas
-    *
-    * @param s            STL output stream
-    * @param expr         A matrix expression
-    */
-    template<typename LHS, typename RHS, typename OP>
-    std::ostream & operator<<(std::ostream & s, const matrix_expression<LHS, RHS, OP> & expr)
-    {
-      typedef typename viennacl::tools::CPU_SCALAR_TYPE_DEDUCER< typename tools::CONST_REMOVER<LHS>::ResultType >::ResultType     ScalarType;
-
-      matrix<ScalarType> temp = expr;
-      s << temp;
-      return s;
-    }
-    
-    /** @brief Returns an expression template class representing a transposed matrix */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                       const matrix<SCALARTYPE, F, ALIGNMENT>,
-                       op_trans> trans(const matrix<SCALARTYPE, F, ALIGNMENT> & mat)
-    {
-      return matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                op_trans>(mat, mat);
-    }
-    
-    
-    /////////////////////// transfer operations: //////////////////////////////////////
-
-    //
-    //cpu to gpu, generic type:
-    //
-    /** @brief Copies a dense matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
-    *
-    * @param cpu_matrix   A dense matrix on the host. Type requirements: .size1() returns number of rows, .size2() returns number of columns. Access to entries via operator()
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template <typename CPU_MATRIX, typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void copy(const CPU_MATRIX & cpu_matrix,
-              matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix )
-    {
-      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
-      
-      //std::cout << "Copying CPU_MATRIX!" << std::endl;
-      //std::cout << "Size at begin: " << gpu_matrix.size1() << ", " << gpu_matrix.size2() << std::endl;
-      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
-      {
-        gpu_matrix.resize(cpu_matrix.size1(),
-                          cpu_matrix.size2(), false);
-      }
-      else
-      {
-        assert( (gpu_matrix.size1() == cpu_matrix.size1()) 
-               && (gpu_matrix.size2() == cpu_matrix.size2())
-              );
-      }
-
-      std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
-      }
-      
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
-      //std::cout << "Size at end: " << gpu_matrix.size1() << ", " << gpu_matrix.size2() << std::endl;
-    }
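
Typical use of these transfer routines, assuming a working ViennaCL/OpenCL installation with header paths as in this source tree; both copy() overloads used below are the ones defined in this file:

    #include <vector>
    #include "viennacl/matrix.hpp"

    void host_device_roundtrip()
    {
      std::vector< std::vector<float> > cpu(4, std::vector<float>(4, 1.0f));
      viennacl::matrix<float> gpu(4, 4);
      viennacl::copy(cpu, gpu);   // host -> device, re-packed into the padded layout
      viennacl::copy(gpu, cpu);   // device -> host
    }
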
-    
-    //
-    //cpu to gpu, STL type:
-    //
-    /** @brief Copies a dense STL-type matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
-    *
-    * @param cpu_matrix   A dense matrix on the host of type std::vector< std::vector<> >. cpu_matrix[i][j] returns the element in the i-th row and j-th column (both starting with zero)
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template <typename SCALARTYPE, typename A1, typename A2, typename F, unsigned int ALIGNMENT>
-    void copy(const std::vector< std::vector<SCALARTYPE, A1>, A2> & cpu_matrix,
-              matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix )
-    {
-      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
-      
-      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
-      {
-        gpu_matrix.resize(cpu_matrix.size(),
-                          cpu_matrix[0].size(),
-                          false);
-      }
-      else
-      {
-        assert( (gpu_matrix.size1() == cpu_matrix.size()) 
-               && (gpu_matrix.size2() == cpu_matrix[0].size())
-              );
-      }
-
-      std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix[i][j];
-      }
-      
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
-    }
-    
-    
-    //
-    //cpu to gpu, another STL type:
-    //
-    /** @brief Copies a dense matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU) without a temporary. The matrix layout on the CPU must match the matrix layout on the GPU.
-    *
-    * @param cpu_matrix_begin   Pointer to the first matrix entry. Cf. iterator concept in STL
-    * @param cpu_matrix_end     Pointer past the last matrix entry. Cf. iterator concept in STL
-    * @param gpu_matrix         A dense ViennaCL matrix
-    */
-    template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void fast_copy(SCALARTYPE * cpu_matrix_begin,
-                   SCALARTYPE * cpu_matrix_end,
-                   matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
-    {
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                            sizeof(SCALARTYPE) * (cpu_matrix_end - cpu_matrix_begin),
-                                                                            cpu_matrix_begin);
-    }
-    
-   
-    #ifdef VIENNACL_HAVE_EIGEN
-    /** @brief Copies a dense Eigen matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
-    *
-    * @param cpu_matrix   A dense Eigen matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th column (both starting with zero)
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template <typename F, unsigned int ALIGNMENT>
-    void copy(const Eigen::MatrixXf & cpu_matrix,
-              matrix<float, F, ALIGNMENT> & gpu_matrix)
-    {
-      typedef typename matrix<float, F, ALIGNMENT>::size_type      size_type;
-      
-      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
-      {
-        gpu_matrix.resize(cpu_matrix.rows(),
-                          cpu_matrix.cols(),
-                          false);
-      }
-      else
-      {
-        assert( (gpu_matrix.size1() == static_cast<std::size_t>(cpu_matrix.rows())) 
-               && (gpu_matrix.size2() == static_cast<std::size_t>(cpu_matrix.cols()))
-              );
-      }
-
-      std::vector<float> data(gpu_matrix.internal_size());
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
-      }
-      
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
-    }
-    
-    /** @brief Copies a dense Eigen matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
-    *
-    * @param cpu_matrix   A dense Eigen matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th column (both starting with zero)
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template <typename F, unsigned int ALIGNMENT>
-    void copy(const Eigen::MatrixXd & cpu_matrix,
-              matrix<double, F, ALIGNMENT> & gpu_matrix)
-    {
-      typedef typename matrix<double, F, ALIGNMENT>::size_type      size_type;
-      
-      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
-      {
-        gpu_matrix.resize(cpu_matrix.rows(),
-                          cpu_matrix.cols(),
-                          false);
-      }
-      else
-      {
-        assert( (gpu_matrix.size1() == static_cast<std::size_t>(cpu_matrix.rows())) 
-               && (gpu_matrix.size2() == static_cast<std::size_t>(cpu_matrix.cols()))
-              );
-      }
-
-      std::vector<double> data(gpu_matrix.internal_size());
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
-      }
-      
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
-    }
-    #endif
-    
-    #ifdef VIENNACL_HAVE_MTL4
-    /** @brief Copies a dense MTL matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
-    *
-    * @param cpu_matrix   A dense MTL matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th column (both starting with zero)
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template <typename SCALARTYPE, typename T, typename F, unsigned int ALIGNMENT>
-    void copy(const mtl::dense2D<SCALARTYPE, T>& cpu_matrix,
-              matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
-    {
-      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
-      
-      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
-      {
-        gpu_matrix.resize(cpu_matrix.num_rows(),
-                          cpu_matrix.num_cols(),
-                          false);
-      }
-      else
-      {
-        assert( (gpu_matrix.size1() == cpu_matrix.num_rows()) 
-               && (gpu_matrix.size2() == cpu_matrix.num_cols())
-              );
-      }
-
-      std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix[i][j];
-      }
-      
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
-    }
-    #endif
-    
-    
-    
-    
-    //
-    //gpu to cpu, generic type
-    //
-    /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU). 
-    *
-    * @param gpu_matrix   A dense ViennaCL matrix
-    * @param cpu_matrix   A dense matrix in host memory. Must have at least as many rows and columns as the gpu_matrix! Type requirement: access to entries via operator()
-    */
-    template <typename CPU_MATRIX, typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
-              CPU_MATRIX & cpu_matrix )
-    {
-      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
-      
-      if ( (gpu_matrix.size1() > 0) && (gpu_matrix.size2() > 0) )
-      {
-        std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        //now copy entries to cpu_matrix:
-        for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-          for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-            cpu_matrix(i,j) = temp_buffer[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
-      }
-    }
-
-    //gpu to cpu, STL type
-    /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU). 
-    *
-    * @param gpu_matrix   A dense ViennaCL matrix
-    * @param cpu_matrix   A dense matrix in host memory using STL types, typically std::vector< std::vector<> >. Must have at least as many rows and columns as the gpu_matrix! Type requirement: access to entries via cpu_matrix[i][j]
-    */
-    template <typename SCALARTYPE, typename A1, typename A2, typename F, unsigned int ALIGNMENT>
-    void copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
-              std::vector< std::vector<SCALARTYPE, A1>, A2> & cpu_matrix)
-    {
-      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
-      
-      if ( (gpu_matrix.size1() > 0) && (gpu_matrix.size2() > 0) 
-         && (cpu_matrix.size() >= gpu_matrix.size1()) && (cpu_matrix[0].size() >= gpu_matrix.size2()))
-      {
-        std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        //now copy entries to cpu_matrix:
-        for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-          for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-            cpu_matrix[i][j] = temp_buffer[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
-      }
-    }
-
-    //gpu to cpu, STL type
-    /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU). 
-    *
-    * @param gpu_matrix         A dense ViennaCL matrix
-    * @param cpu_matrix_begin   Pointer to the output memory on the CPU. User must ensure that provided memory is large enough.
-    */
-    template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void fast_copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
-                   SCALARTYPE * cpu_matrix_begin)
-    {
-      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                       gpu_matrix.handle().get(), 
-                                       CL_TRUE, 0,
-                                       sizeof(SCALARTYPE)*gpu_matrix.internal_size(),
-                                       cpu_matrix_begin, 0, NULL, NULL);
-      VIENNACL_ERR_CHECK(err);
-    }
-
-
-
-
-
-
-
-
-
-    // outer_prod(v1, v2) * val;
-    template<typename CPU_SCALAR, typename SCALARTYPE,unsigned int VECTOR_ALIGNMENT>
-    viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                    op_prod>,
-                                 const SCALARTYPE,
-                                 op_prod>  operator*(const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                                        const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                                        op_prod> & proxy,
-                                                     CPU_SCALAR const & val)
-    {
-      return viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                             const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                             op_prod>,
-                                          const SCALARTYPE,
-                                          op_prod>(proxy, static_cast<SCALARTYPE>(val));
-    }
-
-    // val * outer_prod(v1, v2);
-    template <typename CPU_SCALAR, typename SCALARTYPE, unsigned int VA1, unsigned int VA2>
-    viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
-                                                                    const viennacl::vector<SCALARTYPE, VA2>,
-                                                                    op_prod>,
-                                 const SCALARTYPE,
-                                 op_prod>  operator*(CPU_SCALAR const & val,
-                                                     viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
-                                                                                  const viennacl::vector<SCALARTYPE, VA2>,
-                                                                                  op_prod> const & proxy)
-    {
-      return viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
-                                                                             const viennacl::vector<SCALARTYPE, VA2>,
-                                                                             op_prod>,
-                                          const SCALARTYPE,
-                                          op_prod>(proxy, static_cast<SCALARTYPE>(val));
-    }
-    
-   
-
-} //namespace viennacl
-
-#endif
-=======
 #ifndef VIENNACL_MATRIX_HPP_
 #define VIENNACL_MATRIX_HPP_
 
@@ -4126,4 +3046,3 @@ namespace viennacl
 } //namespace viennacl
 
 #endif
->>>>>>> upstream/1.5.1
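(Illustrative usage, not part of the patch: the copy() and fast_copy() overloads above move a dense matrix between device and host memory. A minimal sketch, assuming a working ViennaCL/OpenCL setup; the 4x4 size and the helper name matrix_roundtrip are arbitrary.)

    #include <vector>
    #include "viennacl/matrix.hpp"

    void matrix_roundtrip()
    {
      viennacl::matrix<float> gpu_A(4, 4);                                       // device matrix
      std::vector< std::vector<float> > host_A(4, std::vector<float>(4, 1.0f));  // host matrix

      viennacl::copy(host_A, gpu_A);   // host -> device
      viennacl::copy(gpu_A, host_A);   // device -> host (overload shown above)

      // fast_copy() reads the raw buffer including layout padding, so the
      // destination must hold internal_size() entries:
      std::vector<float> raw(gpu_A.internal_size());
      viennacl::fast_copy(gpu_A, &raw[0]);
    }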
diff --git a/viennacl/matrix_proxy.hpp b/viennacl/matrix_proxy.hpp
index d7d4f50..8941de2 100644
--- a/viennacl/matrix_proxy.hpp
+++ b/viennacl/matrix_proxy.hpp
@@ -50,114 +50,6 @@ namespace viennacl
       typedef range::difference_type              difference_type;
       typedef value_type                          reference;
       typedef const value_type &                  const_reference;
-<<<<<<< HEAD
-      
-      matrix_range(MatrixType & A, 
-                   range const & row_range,
-                   range const & col_range) : A_(&A), row_range_(row_range), col_range_(col_range) {}
-                   
-      size_type start1() const { return row_range_.start(); }
-      size_type size1() const { return row_range_.size(); }
-
-      size_type start2() const { return col_range_.start(); }
-      size_type size2() const { return col_range_.size(); }
-      
-      ////////// operator= //////////////////////////
-      
-      template <typename MatrixType2>
-      matrix_range<MatrixType> & operator = (const MatrixType2 & other) 
-      {
-        viennacl::linalg::assign(*this, other);
-        return *this;
-      }
-
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_range<MatrixType> & operator = (const matrix_expression< MatrixType1,
-                                                                      MatrixType2,
-                                                                      op_prod > & proxy) 
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_range<MatrixType> & 
-      operator = (const matrix_expression< MatrixType1,
-                                           MatrixType2,
-                                           op_add > & proxy) 
-      {
-        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_range<MatrixType> & 
-      operator = (const matrix_expression< MatrixType1,
-                                           MatrixType2,
-                                           op_sub > & proxy) 
-      {
-        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-
-      ////////// operator+= //////////////////////////
-
-      matrix_range<MatrixType> & operator += (matrix_range<MatrixType> const & other)
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_range<MatrixType> & operator += (const matrix_expression< MatrixType1,
-                                                                       MatrixType2,
-                                                                       op_prod > & proxy)
-      {
-        MatrixType temp = proxy;
-        viennacl::linalg::inplace_add(*this, temp);
-        return *this;
-      }
-      
-      
-      ////////// operator-= //////////////////////////
-      matrix_range<MatrixType> & operator -= (matrix_range<MatrixType> const & other)
-      {
-        viennacl::linalg::inplace_sub(*this, other);
-        return *this;
-      }
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_range<MatrixType> & operator -= (const matrix_expression< MatrixType1,
-                                                                       MatrixType2,
-                                                                       op_prod > & proxy)
-      {
-        MatrixType temp = proxy;
-        viennacl::linalg::inplace_sub(*this, temp);
-        return *this;
-      }
-
-
-      ////////// operator*= //////////////////////////
-
-      template <typename T>
-      matrix_range<MatrixType> & operator *= (T const & val)
-      {
-        viennacl::linalg::inplace_mult(*this, val);
-        return *this;
-      }
-      
-      ////////// operator/= //////////////////////////
-
-      template <typename T>
-      matrix_range<MatrixType> & operator /= (T const & val)
-      {
-        viennacl::linalg::inplace_divide(*this, val);
-        return *this;
-      }
-=======
->>>>>>> upstream/1.5.1
 
       matrix_range(MatrixType & A,
                    range const & row_range,
@@ -180,52 +72,6 @@ namespace viennacl
             matrix_range<matrix<SCALARTYPE, row_major, 1> > & gpu_matrix_range )
   {
     assert( (cpu_matrix.size1() == gpu_matrix_range.size1())
-<<<<<<< HEAD
-           && (cpu_matrix.size2() == gpu_matrix_range.size2()) );
-    
-     if ( gpu_matrix_range.start2() != 0 ||  gpu_matrix_range.size2() !=  gpu_matrix_range.get().size2())
-     {
-       std::vector<SCALARTYPE> entries(gpu_matrix_range.size2());
-       
-       //copy each stride separately:
-       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
-       {
-         for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
-           entries[j] = cpu_matrix(i,j);
-         
-         std::size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.get().internal_size2() + gpu_matrix_range.start2();
-         std::size_t num_entries = gpu_matrix_range.size2();
-         cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                          sizeof(SCALARTYPE)*start_offset,
-                                          sizeof(SCALARTYPE)*num_entries,
-                                          &(entries[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        //std::cout << "Strided copy worked!" << std::endl;
-       }
-     }
-     else
-     {
-       //full block can be copied: 
-       std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.size2());
-       
-       //copy each stride separately:
-       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
-         for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
-           entries[i*gpu_matrix_range.get().internal_size2() + j] = cpu_matrix(i,j);
-       
-       std::size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.get().internal_size2();
-       std::size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
-       //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                         sizeof(SCALARTYPE)*start_offset,
-                                         sizeof(SCALARTYPE)*num_entries,
-                                         &(entries[0]), 0, NULL, NULL);
-       VIENNACL_ERR_CHECK(err);
-       //std::cout << "Block copy worked!" << std::endl;
-     }
-=======
            && (cpu_matrix.size2() == gpu_matrix_range.size2())
            && bool("Matrix size mismatch!"));
 
@@ -260,7 +106,6 @@ namespace viennacl
       viennacl::backend::memory_write(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
       //std::cout << "Block copy worked!" << std::endl;
     }
->>>>>>> upstream/1.5.1
   }
 
   //column_major:
@@ -277,21 +122,6 @@ namespace viennacl
        std::vector<SCALARTYPE> entries(gpu_matrix_range.size1());
 
        //copy each stride separately:
-<<<<<<< HEAD
-       for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
-       {
-         for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
-           entries[i] = cpu_matrix(i,j);
-         
-         std::size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.get().internal_size1() + gpu_matrix_range.start1();
-         std::size_t num_entries = gpu_matrix_range.size1();
-         cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                          sizeof(SCALARTYPE)*start_offset,
-                                          sizeof(SCALARTYPE)*num_entries,
-                                          &(entries[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-=======
        for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
        {
          for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
@@ -300,7 +130,6 @@ namespace viennacl
          vcl_size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.internal_size1() + gpu_matrix_range.start1();
          vcl_size_t num_entries = gpu_matrix_range.size1();
          viennacl::backend::memory_write(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
->>>>>>> upstream/1.5.1
         //std::cout << "Strided copy worked!" << std::endl;
        }
      }
@@ -310,21 +139,6 @@ namespace viennacl
        std::vector<SCALARTYPE> entries(gpu_matrix_range.internal_size1()*gpu_matrix_range.size2());
 
        //copy each stride separately:
-<<<<<<< HEAD
-       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
-         for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
-           entries[i + j*gpu_matrix_range.get().internal_size1()] = cpu_matrix(i,j);
-       
-       std::size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.get().internal_size1();
-       std::size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
-       //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                         sizeof(SCALARTYPE)*start_offset,
-                                         sizeof(SCALARTYPE)*num_entries,
-                                         &(entries[0]), 0, NULL, NULL);
-       VIENNACL_ERR_CHECK(err);
-=======
        for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
          for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
            entries[i + j*gpu_matrix_range.internal_size1()] = cpu_matrix(i,j);
@@ -332,7 +146,6 @@ namespace viennacl
        vcl_size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.internal_size1();
        vcl_size_t num_entries = gpu_matrix_range.internal_size1() * gpu_matrix_range.size2();
        viennacl::backend::memory_write(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
->>>>>>> upstream/1.5.1
        //std::cout << "Block copy worked!" << std::endl;
      }
 
@@ -358,21 +171,6 @@ namespace viennacl
        std::vector<SCALARTYPE> entries(gpu_matrix_range.size2());
 
        //copy each stride separately:
-<<<<<<< HEAD
-       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
-       {
-         std::size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.get().internal_size2() + gpu_matrix_range.start2();
-         std::size_t num_entries = gpu_matrix_range.size2();
-         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                          sizeof(SCALARTYPE)*start_offset,
-                                          sizeof(SCALARTYPE)*num_entries,
-                                          &(entries[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        //std::cout << "Strided copy worked!" << std::endl;
-        
-        for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
-=======
        for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
        {
          vcl_size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.internal_size2() + gpu_matrix_range.start2();
@@ -381,31 +179,11 @@ namespace viennacl
         //std::cout << "Strided copy worked!" << std::endl;
 
         for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
->>>>>>> upstream/1.5.1
           cpu_matrix(i,j) = entries[j];
        }
      }
      else
      {
-<<<<<<< HEAD
-       //full block can be copied: 
-       std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.size2());
-       
-       std::size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.get().internal_size2();
-       std::size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
-       //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                         sizeof(SCALARTYPE)*start_offset,
-                                         sizeof(SCALARTYPE)*num_entries,
-                                         &(entries[0]), 0, NULL, NULL);
-       VIENNACL_ERR_CHECK(err);
-       //std::cout << "Block copy worked!" << std::endl;
-
-       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
-         for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
-           cpu_matrix(i,j) = entries[i*gpu_matrix_range.get().internal_size2() + j];
-=======
        //full block can be copied:
        std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.internal_size2());
 
@@ -417,7 +195,6 @@ namespace viennacl
        for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
          for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
            cpu_matrix(i,j) = entries[i*gpu_matrix_range.internal_size2() + j];
->>>>>>> upstream/1.5.1
     }
 
   }
@@ -437,21 +214,6 @@ namespace viennacl
        std::vector<SCALARTYPE> entries(gpu_matrix_range.size1());
 
        //copy each stride separately:
-<<<<<<< HEAD
-       for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
-       {
-         std::size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.get().internal_size1() + gpu_matrix_range.start1();
-         std::size_t num_entries = gpu_matrix_range.size1();
-         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                          sizeof(SCALARTYPE)*start_offset,
-                                          sizeof(SCALARTYPE)*num_entries,
-                                          &(entries[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        //std::cout << "Strided copy worked!" << std::endl;
-        
-        for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
-=======
        for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
        {
          vcl_size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.internal_size1() + gpu_matrix_range.start1();
@@ -460,7 +222,6 @@ namespace viennacl
         //std::cout << "Strided copy worked!" << std::endl;
 
         for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
->>>>>>> upstream/1.5.1
           cpu_matrix(i,j) = entries[i];
        }
      }
@@ -470,22 +231,6 @@ namespace viennacl
        std::vector<SCALARTYPE> entries(gpu_matrix_range.internal_size1()*gpu_matrix_range.size2());
 
        //copy each stride separately:
-<<<<<<< HEAD
-       std::size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.get().internal_size1();
-       std::size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
-       //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                         sizeof(SCALARTYPE)*start_offset,
-                                         sizeof(SCALARTYPE)*num_entries,
-                                         &(entries[0]), 0, NULL, NULL);
-       VIENNACL_ERR_CHECK(err);
-       //std::cout << "Block copy worked!" << std::endl;
-       
-       for (std::size_t i=0; i < gpu_matrix_range.size1(); ++i)
-         for (std::size_t j=0; j < gpu_matrix_range.size2(); ++j)
-           cpu_matrix(i,j) = entries[i + j*gpu_matrix_range.get().internal_size1()];
-=======
        vcl_size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.internal_size1();
        vcl_size_t num_entries = gpu_matrix_range.internal_size1() * gpu_matrix_range.size2();
        viennacl::backend::memory_read(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
@@ -494,7 +239,6 @@ namespace viennacl
        for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
          for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
            cpu_matrix(i,j) = entries[i + j*gpu_matrix_range.internal_size1()];
->>>>>>> upstream/1.5.1
      }
 
   }
@@ -713,398 +457,6 @@ namespace viennacl
     return matrix_slice<MatrixType>(A, r1, r2);
   }
 
-<<<<<<< HEAD
-
-
-
-
-
-//
-//
-//
-/////////////////////////////// Slice /////////////////////////////////////////////
-//
-//
-//
-
-
-
-
-
-
-
-
-
-  template <typename MatrixType>
-  class matrix_slice
-  {
-    public:
-      typedef typename MatrixType::value_type     value_type;
-      typedef typename viennacl::result_of::cpu_value_type<value_type>::type    cpu_value_type;
-      typedef slice::size_type                    size_type;
-      typedef slice::difference_type              difference_type;
-      typedef value_type                          reference;
-      typedef const value_type &                  const_reference;
-      
-      matrix_slice(MatrixType & A, 
-                   slice const & row_slice,
-                   slice const & col_slice) : A_(&A), row_slice_(row_slice), col_slice_(col_slice) {}
-                   
-      size_type start1() const { return row_slice_.start(); }
-      size_type stride1() const { return row_slice_.stride(); }
-      size_type size1() const { return row_slice_.size(); }
-
-      size_type start2() const { return col_slice_.start(); }
-      size_type stride2() const { return col_slice_.stride(); }
-      size_type size2() const { return col_slice_.size(); }
-      
-      ////////// operator= //////////////////////////
-      
-      template <typename MatrixType2>
-      matrix_slice<MatrixType> & operator = (const MatrixType2 & other) 
-      {
-        viennacl::linalg::assign(*this, other);
-        return *this;
-      }
-
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_slice<MatrixType> & operator = (const matrix_expression< MatrixType1,
-                                                                      MatrixType2,
-                                                                      op_prod > & proxy) 
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_slice<MatrixType> & 
-      operator = (const matrix_expression< MatrixType1,
-                                           MatrixType2,
-                                           op_add > & proxy) 
-      {
-        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_slice<MatrixType> & 
-      operator = (const matrix_expression< MatrixType1,
-                                           MatrixType2,
-                                           op_sub > & proxy) 
-      {
-        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-
-      ////////// operator+= //////////////////////////
-
-      matrix_slice<MatrixType> & operator += (matrix_slice<MatrixType> const & other)
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_slice<MatrixType> & operator += (const matrix_expression< MatrixType1,
-                                                                       MatrixType2,
-                                                                       op_prod > & proxy)
-      {
-        MatrixType temp = proxy;
-        viennacl::linalg::inplace_add(*this, temp);
-        return *this;
-      }
-      
-      
-      ////////// operator-= //////////////////////////
-      matrix_slice<MatrixType> & operator -= (matrix_slice<MatrixType> const & other)
-      {
-        viennacl::linalg::inplace_sub(*this, other);
-        return *this;
-      }
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_slice<MatrixType> & operator -= (const matrix_expression< MatrixType1,
-                                                                       MatrixType2,
-                                                                       op_prod > & proxy)
-      {
-        MatrixType temp = proxy;
-        viennacl::linalg::inplace_sub(*this, temp);
-        return *this;
-      }
-
-
-      ////////// operator*= //////////////////////////
-
-      template <typename T>
-      matrix_slice<MatrixType> & operator *= (T const & val)
-      {
-        viennacl::linalg::inplace_mult(*this, val);
-        return *this;
-      }
-      
-      ////////// operator/= //////////////////////////
-
-      template <typename T>
-      matrix_slice<MatrixType> & operator /= (T const & val)
-      {
-        viennacl::linalg::inplace_divide(*this, val);
-        return *this;
-      }
-
-      matrix_slice<MatrixType> & operator /= (cpu_value_type val)
-      {
-        viennacl::linalg::inplace_mult(*this, cpu_value_type(1.0) / val);
-        return *this;
-      }
-
-
-      ////////// operator+ //////////////////////////
-      
-      template <typename MatrixType2>
-      typename viennacl::enable_if< viennacl::is_matrix<MatrixType2>::value,
-                                    matrix_expression< const matrix_slice<MatrixType>,
-                                                       const MatrixType2,
-                                                       op_add > >::type
-      operator + (const MatrixType2 & other) 
-      {
-        return matrix_expression< const matrix_slice<MatrixType>,
-                                  const MatrixType2,
-                                  op_add > (*this, other);
-      }
-      
-      ////////// operator- //////////////////////////
-      
-      template <typename MatrixType2>
-      typename viennacl::enable_if< viennacl::is_matrix<MatrixType2>::value,
-                                    matrix_expression< const matrix_slice<MatrixType>,
-                                                       const MatrixType2,
-                                                       op_sub > >::type
-      operator - (const MatrixType2 & other) 
-      {
-        return matrix_expression< const matrix_slice<MatrixType>,
-                                  const MatrixType2,
-                                  op_sub > (*this, other);
-      }
-      
-      
-      
-
-      //const_reference operator()(size_type i, size_type j) const { return A_(start1() + i, start2() + j); }
-      //reference operator()(size_type i, size_type j) { return A_(start1() + i, start2() + j); }
-
-      MatrixType & get() { return *A_; }
-      const MatrixType & get() const { return *A_; }
-
-    private:
-      MatrixType * A_;
-      slice row_slice_;
-      slice col_slice_;
-  };
-
-  
-  /** @brief Returns an expression template class representing a transposed matrix */
-  template <typename MatrixType>
-  matrix_expression< const matrix_slice<MatrixType>,
-                     const matrix_slice<MatrixType>,
-                     op_trans> trans(const matrix_slice<MatrixType> & mat)
-  {
-    return matrix_expression< const matrix_slice<MatrixType>,
-                              const matrix_slice<MatrixType>,
-                              op_trans>(mat, mat);
-  }
-  
-  
-  
-  
-  /////////////////////////////////////////////////////////////
-  ///////////////////////// CPU to GPU ////////////////////////
-  /////////////////////////////////////////////////////////////
-  
-  //row_major:
-  template <typename CPU_MATRIX, typename SCALARTYPE>
-  void copy(const CPU_MATRIX & cpu_matrix,
-            matrix_slice<matrix<SCALARTYPE, row_major, 1> > & gpu_matrix_slice )
-  {
-    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
-           && (cpu_matrix.size2() == gpu_matrix_slice.size2()) );
-    
-     if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
-     {
-       std::size_t num_entries = gpu_matrix_slice.size2() * gpu_matrix_slice.stride2(); //no. of entries per stride
-       
-       std::vector<SCALARTYPE> entries(num_entries);
-       
-       //copy each stride separately:
-       for (std::size_t i=0; i < gpu_matrix_slice.size1(); ++i)
-       {
-         std::size_t start_offset = (gpu_matrix_slice.start1() + i * gpu_matrix_slice.stride1()) * gpu_matrix_slice.get().internal_size2() + gpu_matrix_slice.start2();
-         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_matrix_slice.get().handle().get(), CL_TRUE, 
-                                          sizeof(SCALARTYPE)*start_offset,
-                                          sizeof(SCALARTYPE)*num_entries,
-                                          &(entries[0]), 0, NULL, NULL);
-         VIENNACL_ERR_CHECK(err);
-         
-         for (std::size_t j=0; j < gpu_matrix_slice.size2(); ++j)
-           entries[j * gpu_matrix_slice.stride2()] = cpu_matrix(i,j);
-         
-         err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                    gpu_matrix_slice.get().handle().get(), CL_TRUE, 
-                                    sizeof(SCALARTYPE)*start_offset,
-                                    sizeof(SCALARTYPE)*num_entries,
-                                    &(entries[0]), 0, NULL, NULL);
-         
-         VIENNACL_ERR_CHECK(err);
-       }
-     }
-  }
-  
-  //column_major:
-  template <typename CPU_MATRIX, typename SCALARTYPE>
-  void copy(const CPU_MATRIX & cpu_matrix,
-            matrix_slice<matrix<SCALARTYPE, column_major, 1> > & gpu_matrix_slice )
-  {
-    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
-           && (cpu_matrix.size2() == gpu_matrix_slice.size2()) );
-    
-    if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
-    {
-      std::size_t num_entries = gpu_matrix_slice.size1() * gpu_matrix_slice.stride1(); //no. of entries per stride
-      
-      std::vector<SCALARTYPE> entries(num_entries);
-      
-      //copy each column stride separately:
-      for (std::size_t j=0; j < gpu_matrix_slice.size2(); ++j)
-      {
-        std::size_t start_offset = gpu_matrix_slice.start1() + (gpu_matrix_slice.start2() + j * gpu_matrix_slice.stride2()) * gpu_matrix_slice.get().internal_size1();
-        
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                        gpu_matrix_slice.get().handle().get(), CL_TRUE, 
-                                        sizeof(SCALARTYPE)*start_offset,
-                                        sizeof(SCALARTYPE)*num_entries,
-                                        &(entries[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        for (std::size_t i=0; i < gpu_matrix_slice.size1(); ++i)
-          entries[i * gpu_matrix_slice.stride1()] = cpu_matrix(i,j);
-        
-        err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                   gpu_matrix_slice.get().handle().get(), CL_TRUE, 
-                                   sizeof(SCALARTYPE)*start_offset,
-                                   sizeof(SCALARTYPE)*num_entries,
-                                   &(entries[0]), 0, NULL, NULL);
-        
-        VIENNACL_ERR_CHECK(err);
-      }
-    }
-    
-  }
-
-
-  /////////////////////////////////////////////////////////////
-  ///////////////////////// GPU to CPU ////////////////////////
-  /////////////////////////////////////////////////////////////
-  
-  
-  //row_major:
-  template <typename CPU_MATRIX, typename SCALARTYPE>
-  void copy(matrix_slice<matrix<SCALARTYPE, row_major, 1> > const & gpu_matrix_slice,
-            CPU_MATRIX & cpu_matrix)
-  {
-    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
-           && (cpu_matrix.size2() == gpu_matrix_slice.size2()) );
-    
-     if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
-     {
-       std::size_t num_entries = gpu_matrix_slice.size2() * gpu_matrix_slice.stride2(); //no. of entries per stride
-       
-       std::vector<SCALARTYPE> entries(num_entries);
-       
-       //copy each stride separately:
-       for (std::size_t i=0; i < gpu_matrix_slice.size1(); ++i)
-       {
-         std::size_t start_offset = (gpu_matrix_slice.start1() + i * gpu_matrix_slice.stride1()) * gpu_matrix_slice.get().internal_size2() + gpu_matrix_slice.start2();
-         
-         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_matrix_slice.get().handle().get(), CL_TRUE, 
-                                          sizeof(SCALARTYPE)*start_offset,
-                                          sizeof(SCALARTYPE)*num_entries,
-                                          &(entries[0]), 0, NULL, NULL);
-         
-         for (std::size_t j=0; j < gpu_matrix_slice.size2(); ++j)
-           cpu_matrix(i,j) = entries[j * gpu_matrix_slice.stride2()];
-         
-        VIENNACL_ERR_CHECK(err);
-       }
-     }
-    
-  }
-  
-  
-  //column_major:
-  template <typename CPU_MATRIX, typename SCALARTYPE>
-  void copy(matrix_slice<matrix<SCALARTYPE, column_major, 1> > const & gpu_matrix_slice,
-            CPU_MATRIX & cpu_matrix)
-  {
-    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
-           && (cpu_matrix.size2() == gpu_matrix_slice.size2()) );
-    
-    if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
-    {
-      std::size_t num_entries = gpu_matrix_slice.size1() * gpu_matrix_slice.stride1(); //no. of entries per stride
-      
-      std::vector<SCALARTYPE> entries(num_entries);
-      
-      //copy each column stride separately:
-      for (std::size_t j=0; j < gpu_matrix_slice.size2(); ++j)
-      {
-        std::size_t start_offset = gpu_matrix_slice.start1() + (gpu_matrix_slice.start2() + j * gpu_matrix_slice.stride2()) * gpu_matrix_slice.get().internal_size1();
-        
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                        gpu_matrix_slice.get().handle().get(), CL_TRUE, 
-                                        sizeof(SCALARTYPE)*start_offset,
-                                        sizeof(SCALARTYPE)*num_entries,
-                                        &(entries[0]), 0, NULL, NULL);
-        
-        for (std::size_t i=0; i < gpu_matrix_slice.size1(); ++i)
-          cpu_matrix(i,j) = entries[i * gpu_matrix_slice.stride1()];
-        
-        VIENNACL_ERR_CHECK(err);
-      }
-    }
-    
-  }
-
-
-  template<typename MatrixType>
-  std::ostream & operator<<(std::ostream & s, matrix_slice<MatrixType> const & proxy)
-  {
-    MatrixType temp = proxy;
-    s << temp;
-    return s;
-  }
-
-  template<typename MatrixType>
-  std::ostream & operator<<(std::ostream & s, matrix_slice<const MatrixType> const & proxy)
-  {
-    MatrixType temp = proxy;
-    s << temp;
-    return s;
-  }
-
-
-  //
-  // Convenience function
-  //
-  template <typename MatrixType>
-  matrix_slice<MatrixType> project(MatrixType & A, viennacl::slice const & r1, viennacl::slice const & r2)
-  {
-    return matrix_slice<MatrixType>(A, r1, r2);
-  }
-
-=======
   template <typename MatrixType>
   matrix_slice<MatrixType> project(matrix_range<MatrixType> & A, viennacl::slice const & r1, viennacl::slice const & r2)
   {
@@ -1126,7 +478,6 @@ namespace viennacl
                                     viennacl::slice(A.start2() + r2.start(), A.stride2() * r2.stride(), r2.size())
                                    );
   }
->>>>>>> upstream/1.5.1
 
   // TODO: Allow mix of range/slice
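(Illustrative usage, not part of the patch: project() returns a lightweight matrix_range or matrix_slice view on an existing matrix, so operations touch only the selected block. A hedged sketch assuming the usual range(start, stop) semantics; sizes are arbitrary.)

    #include "viennacl/matrix.hpp"
    #include "viennacl/matrix_proxy.hpp"

    void submatrix_view()
    {
      viennacl::matrix<float> A(8, 8);

      viennacl::range rows(0, 4);   // rows 0..3
      viennacl::range cols(2, 6);   // columns 2..5

      // View on the selected block; no data is copied:
      viennacl::matrix_range< viennacl::matrix<float> > sub = viennacl::project(A, rows, cols);

      sub *= 2.0f;   // scales only the 4x4 block inside A
    }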
 
diff --git a/viennacl/meta/predicate.hpp b/viennacl/meta/predicate.hpp
index 5efb701..fac514e 100644
--- a/viennacl/meta/predicate.hpp
+++ b/viennacl/meta/predicate.hpp
@@ -1,130 +1,3 @@
-<<<<<<< HEAD
-#ifndef VIENNACL_META_PREDICATE_HPP_
-#define VIENNACL_META_PREDICATE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file predicate.hpp
-    @brief All the predicates used within ViennaCL. Checks whether expressions are vectors, matrices, scalars, etc.
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-
-namespace viennacl
-{
-    //
-    // is_cpu_scalar: checks for float or double
-    //
-    template <typename T>
-    struct is_cpu_scalar
-    {
-      enum { value = false };
-    };
-  
-    template <>
-    struct is_cpu_scalar<float>
-    {
-      enum { value = true };
-    };
-
-    template <>
-    struct is_cpu_scalar<double>
-    {
-      enum { value = true };
-    };
-    
-    //
-    // is_scalar: checks for viennacl::scalar
-    //
-    template <typename T>
-    struct is_scalar
-    {
-      enum { value = false };
-    };
-  
-    template <typename T>
-    struct is_scalar<viennacl::scalar<T> >
-    {
-      enum { value = true };
-    };
-  
-    //
-    // is_vector
-    //
-    template <typename T>
-    struct is_vector
-    {
-      enum { value = false };
-    };
-
-    template <typename ScalarType, unsigned int ALIGNMENT>
-    struct is_vector<viennacl::vector<ScalarType, ALIGNMENT> >
-    {
-      enum { value = true };
-    };
-
-    template <typename T>
-    struct is_vector<viennacl::vector_range<T> >
-    {
-      enum { value = true };
-    };
-    
-    template <typename T>
-    struct is_vector<viennacl::vector_slice<T> >
-    {
-      enum { value = true };
-    };
-    
-    
-    //
-    // is_matrix
-    //
-    template <typename T>
-    struct is_matrix
-    {
-      enum { value = false };
-    };
-
-    template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-    struct is_matrix<viennacl::matrix<ScalarType, F, ALIGNMENT> >
-    {
-      enum { value = true };
-    };
-
-    template <typename T>
-    struct is_matrix<viennacl::matrix_range<T> >
-    {
-      enum { value = true };
-    };
-    
-    template <typename T>
-    struct is_matrix<viennacl::matrix_slice<T> >
-    {
-      enum { value = true };
-    };
-
-} //namespace viennacl
-    
-
-#endif
-=======
 #ifndef VIENNACL_META_PREDICATE_HPP_
 #define VIENNACL_META_PREDICATE_HPP_
 
@@ -636,4 +509,3 @@ namespace viennacl
 
 
 #endif
->>>>>>> upstream/1.5.1
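(Illustrative usage, not part of the patch: the is_matrix / is_vector / is_cpu_scalar predicates above expose a compile-time 'value' member. A small sketch of what they evaluate to, assuming the standard ViennaCL headers are available.)

    #include <iostream>
    #include "viennacl/meta/predicate.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/matrix.hpp"

    int main()
    {
      std::cout << viennacl::is_matrix< viennacl::matrix<float> >::value << "\n";   // 1
      std::cout << viennacl::is_vector< viennacl::vector<float> >::value << "\n";   // 1
      std::cout << viennacl::is_cpu_scalar<double>::value << "\n";                  // 1
      std::cout << viennacl::is_cpu_scalar<int>::value << "\n";                     // 0
      return 0;
    }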
diff --git a/viennacl/meta/result_of.hpp b/viennacl/meta/result_of.hpp
index fc07f68..579c5db 100644
--- a/viennacl/meta/result_of.hpp
+++ b/viennacl/meta/result_of.hpp
@@ -1,290 +1,3 @@
-<<<<<<< HEAD
-#ifndef VIENNACL_META_RESULT_OF_HPP_
-#define VIENNACL_META_RESULT_OF_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/meta/result_of.hpp
-    @brief A collection of compile time type deductions
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-
-
-#ifdef VIENNACL_HAVE_UBLAS  
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#endif
-
-#ifdef VIENNACL_HAVE_EIGEN  
-#include <Eigen/Core>
-#include <Eigen/Sparse>
-#endif
-
-#ifdef VIENNACL_HAVE_MTL4
-#include <boost/numeric/mtl/mtl.hpp>
-#endif
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-    namespace result_of
-    {
-      //
-      // Retrieve size_type 
-      //
-      template <typename T>
-      struct size_type
-      {
-        typedef typename T::size_type   type;
-      };
-
-      #ifdef VIENNACL_HAVE_EIGEN
-      template <class T, int a, int b, int c, int d, int e>
-      struct size_type< Eigen::Matrix<T, a, b, c, d, e> >
-      {
-        typedef std::size_t   type;
-      };
-      
-      template <>
-      struct size_type<Eigen::VectorXf>
-      {
-        typedef std::size_t   type;
-      };
-      
-      template <>
-      struct size_type<Eigen::VectorXd>
-      {
-        typedef std::size_t   type;
-      };
-
-      template <typename T, int options>
-      struct size_type<Eigen::SparseMatrix<T, options> >
-      {
-        typedef std::size_t   type;
-      };
-      #endif
-      
-      //
-      // Retrieve value_type:
-      //
-      template <typename T>
-      struct value_type
-      {
-        typedef typename T::value_type    type; 
-      };
-
-      //
-      // Retrieve cpu value_type:
-      //
-      template <typename T>
-      struct cpu_value_type
-      {
-        typedef typename T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T    type; 
-      };
-
-      template <>
-      struct cpu_value_type<float>
-      {
-        typedef float    type; 
-      };
-      
-      template <>
-      struct cpu_value_type<double>
-      {
-        typedef double    type; 
-      };
-      
-      template <typename T>
-      struct cpu_value_type<viennacl::scalar<T> >
-      {
-        typedef T    type; 
-      };
-
-      template <typename T, unsigned int ALIGNMENT>
-      struct cpu_value_type<viennacl::vector<T, ALIGNMENT> >
-      {
-        typedef T    type; 
-      };
-
-      template <typename T>
-      struct cpu_value_type<viennacl::vector_range<T> >
-      {
-        typedef typename cpu_value_type<T>::type    type; 
-      };
-
-      template <typename T>
-      struct cpu_value_type<viennacl::vector_slice<T> >
-      {
-        typedef typename cpu_value_type<T>::type    type; 
-      };
-      
-      template <typename T1, typename T2, typename OP>
-      struct cpu_value_type<viennacl::vector_expression<T1, T2, OP> >
-      {
-        typedef typename cpu_value_type<T1>::type    type; 
-      };
-      
-      
-      
-      template <typename T, typename F, unsigned int ALIGNMENT>
-      struct cpu_value_type<viennacl::matrix<T, F, ALIGNMENT> >
-      {
-        typedef T    type; 
-      };
-      
-      template <typename T>
-      struct cpu_value_type<viennacl::matrix_range<T> >
-      {
-        typedef typename cpu_value_type<T>::type    type; 
-      };
-
-      template <typename T>
-      struct cpu_value_type<viennacl::matrix_slice<T> >
-      {
-        typedef typename cpu_value_type<T>::type    type; 
-      };
-      
-      template <typename T1, typename T2, typename OP>
-      struct cpu_value_type<viennacl::matrix_expression<T1, T2, OP> >
-      {
-        typedef typename cpu_value_type<T1>::type    type; 
-      };
-      
-      
-    #ifdef VIENNACL_HAVE_EIGEN  
-      template <>
-      struct value_type<Eigen::MatrixXf>
-      {
-        typedef Eigen::MatrixXf::RealScalar    type; 
-      };
-      
-      template <>
-      struct value_type<Eigen::MatrixXd>
-      {
-        typedef Eigen::MatrixXd::RealScalar    type; 
-      };
-
-      template <typename ScalarType, int option>
-      struct value_type<Eigen::SparseMatrix<ScalarType, option> >
-      {
-        typedef ScalarType    type; 
-      };
-
-      template <>
-      struct value_type<Eigen::VectorXf>
-      {
-        typedef Eigen::VectorXf::RealScalar    type; 
-      };
-
-      template <>
-      struct value_type<Eigen::VectorXd>
-      {
-        typedef Eigen::VectorXd::RealScalar    type; 
-      };
-      
-    #endif
-      
-      
-      
-      template <typename T>
-      struct matrix_expression_internal_storage
-      {
-        typedef T &     type;
-      };
-     
-      template <>
-      struct matrix_expression_internal_storage<const float>
-      {
-        typedef float type;
-      };
-      
-      template <>
-      struct matrix_expression_internal_storage<const double>
-      {
-        typedef double type;
-      };
-      
-      
-      
-      
-      //
-      // Deduce compatible vector type for a matrix type
-      //
-
-      template <typename T>
-      struct vector_for_matrix
-      {
-        typedef typename T::ERROR_CANNOT_DEDUCE_VECTOR_FOR_MATRIX_TYPE   type;
-      };
-
-      //ViennaCL
-      template <typename T, typename F, unsigned int A>
-      struct vector_for_matrix< viennacl::matrix<T, F, A> >
-      {
-        typedef viennacl::vector<T,A>   type;
-      };
-
-      template <typename T, unsigned int A>
-      struct vector_for_matrix< viennacl::compressed_matrix<T, A> >
-      {
-        typedef viennacl::vector<T,A>   type;
-      };
-
-      template <typename T, unsigned int A>
-      struct vector_for_matrix< viennacl::coordinate_matrix<T, A> >
-      {
-        typedef viennacl::vector<T,A>   type;
-      };
-
-      #ifdef VIENNACL_HAVE_UBLAS
-      //Boost:
-      template <typename T, typename F, typename A>
-      struct vector_for_matrix< boost::numeric::ublas::matrix<T, F, A> >
-      {
-        typedef boost::numeric::ublas::vector<T>   type;
-      };
-
-      template <typename T, typename U, std::size_t A, typename B, typename C>
-      struct vector_for_matrix< boost::numeric::ublas::compressed_matrix<T, U, A, B, C> >
-      {
-        typedef boost::numeric::ublas::vector<T>   type;
-      };
-
-      template <typename T, typename U, std::size_t A, typename B, typename C>
-      struct vector_for_matrix< boost::numeric::ublas::coordinate_matrix<T, U, A, B, C> >
-      {
-        typedef boost::numeric::ublas::vector<T>   type;
-      };
-      #endif
-
-      
-      
-    } //namespace result_of
-} //namespace viennacl
-    
-
-#endif
-=======
 #ifndef VIENNACL_META_RESULT_OF_HPP_
 #define VIENNACL_META_RESULT_OF_HPP_
 
@@ -916,4 +629,3 @@ namespace viennacl
 
 
 #endif
->>>>>>> upstream/1.5.1
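(Illustrative usage, not part of the patch: result_of::cpu_value_type deduces the underlying host scalar type of a ViennaCL type at compile time. A sketch of the usual pattern in generic code; the function name first_entry is hypothetical.)

    #include "viennacl/meta/result_of.hpp"

    // Returns the first entry of any ViennaCL vector-like type as a host scalar.
    template <typename VectorT>
    typename viennacl::result_of::cpu_value_type<VectorT>::type
    first_entry(VectorT const & v)
    {
      typedef typename viennacl::result_of::cpu_value_type<VectorT>::type cpu_scalar;
      cpu_scalar result = v[0];   // reading a device entry implies a host transfer
      return result;
    }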
diff --git a/viennacl/meta/tag_of.hpp b/viennacl/meta/tag_of.hpp
index 3ef40ca..8329e61 100644
--- a/viennacl/meta/tag_of.hpp
+++ b/viennacl/meta/tag_of.hpp
@@ -252,11 +252,7 @@ namespace viennacl
     {
       typedef viennacl::tag_viennacl  type;
     };
-<<<<<<< HEAD
-    
-=======
 
->>>>>>> upstream/1.5.1
     template< typename T, unsigned int I>
     struct tag_of< viennacl::circulant_matrix<T,I> >
     {
diff --git a/viennacl/ocl/backend.hpp b/viennacl/ocl/backend.hpp
index 7157de1..b171e76 100644
--- a/viennacl/ocl/backend.hpp
+++ b/viennacl/ocl/backend.hpp
@@ -160,14 +160,6 @@ namespace viennacl
           contexts_[i].default_device_type(t);
         }
 
-<<<<<<< HEAD
-        /** @brief Sets the index of the platform to be used by the context */
-        static void set_context_platform_index(long i, std::size_t pf_index)
-        {
-          contexts_[i].platform_index(pf_index);
-        }
-        
-=======
         /** @brief Sets the maximum number of devices per context. Ignored if a device array is provided as well.  */
         static void set_context_device_num(long i, vcl_size_t num)
         {
@@ -180,7 +172,6 @@ namespace viennacl
           contexts_[i].platform_index(pf_index);
         }
 
->>>>>>> upstream/1.5.1
       private:
         static long current_context_id_;
         static std::map<long, bool> initialized_;
@@ -286,19 +277,6 @@ namespace viennacl
       set_context_device_type(i, CL_DEVICE_TYPE_ACCELERATOR);
     }
 
-<<<<<<< HEAD
-    
-    /** @brief Convenience function for setting the platform index
-     * 
-     * @param i         Context ID
-     * @param pf_index  The platform index as returned by clGetPlatformIDs(). This is not the ID of type cl_platform_id!
-     */
-    inline void set_context_platform_index(long i, std::size_t pf_index)
-    {
-      viennacl::ocl::backend<>::set_context_platform_index(i, pf_index);
-    }
-    
-=======
     /** @brief Convenience function for setting the number of default devices per context */
     inline void set_context_device_num(long i, vcl_size_t num)
     {
@@ -316,7 +294,6 @@ namespace viennacl
       viennacl::ocl::backend<>::set_context_platform_index(i, pf_index);
     }
 
->>>>>>> upstream/1.5.1
     ///////////////////////// get queues ///////////////////
     /** @brief Convenience function for getting the default queue for the currently active device in the active context */
     inline viennacl::ocl::command_queue & get_queue()
diff --git a/viennacl/ocl/context.hpp b/viennacl/ocl/context.hpp
index 67fc621..c782adc 100644
--- a/viennacl/ocl/context.hpp
+++ b/viennacl/ocl/context.hpp
@@ -55,18 +55,11 @@ namespace viennacl
       public:
         context() : initialized_(false),
                     device_type_(CL_DEVICE_TYPE_DEFAULT),
-<<<<<<< HEAD
-                    current_device_id(0),
-                    default_device_num_(1),
-                    pf_index_(0) {}
-        
-=======
                     current_device_id_(0),
                     default_device_num_(1),
                     pf_index_(0),
                     current_queue_id_(0) {}
 
->>>>>>> upstream/1.5.1
         //////// Get and set default number of devices per context */
         /** @brief Returns the maximum number of devices to be set up for the context */
         vcl_size_t default_device_num() const { return default_device_num_; }
@@ -367,18 +360,11 @@ namespace viennacl
           //
           cl_program temp = clCreateProgramWithSource(h_.get(), 1, (const char **)&source_text, &source_size, &err);
           VIENNACL_ERR_CHECK(err);
-<<<<<<< HEAD
-          
-          const char * options = build_options_.c_str();
-          err = clBuildProgram(temp.get(), 0, NULL, options, NULL, NULL);
-          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_BUILD)
-=======
 
           const char * options = build_options_.c_str();
           err = clBuildProgram(temp, 0, NULL, options, NULL, NULL);
           if (err != CL_SUCCESS)
           {
->>>>>>> upstream/1.5.1
             char buffer[8192];
             cl_build_status status;
             clGetProgramBuildInfo(temp, devices_[0].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
@@ -488,32 +474,14 @@ namespace viennacl
 
         /** @brief Returns the context handle */
         const viennacl::ocl::handle<cl_context> & handle() const { return h_; }
-<<<<<<< HEAD
-        
-        /** @brief Returns the current build option string */
-        std::string build_options() const { return build_options_; }
-        
-=======
 
         /** @brief Returns the current build option string */
         std::string build_options() const { return build_options_; }
 
->>>>>>> upstream/1.5.1
         /** @brief Sets the build option string, which is passed to the OpenCL compiler in subsequent compilations. Does not affect programs compiled previously. */
         void build_options(std::string op) { build_options_ = op; }
 
         /** @brief Returns the platform ID of the platform to be used for the context */
-<<<<<<< HEAD
-        std::size_t platform_index() const  { return pf_index_; }
-
-        /** @brief Sets the platform ID of the platform to be used for the context */
-        void platform_index(std::size_t new_index)
-        {
-          assert(!initialized_ && "Platform ID must be set before context is initialized!");
-          pf_index_ = new_index; 
-        }
-        
-=======
         vcl_size_t platform_index() const  { return pf_index_; }
 
         /** @brief Sets the platform ID of the platform to be used for the context */
@@ -523,7 +491,6 @@ namespace viennacl
           pf_index_ = new_index;
         }
 
->>>>>>> upstream/1.5.1
         /** @brief Less-than comparable for compatibility with std:map  */
         bool operator<(context const & other) const
         {
@@ -553,11 +520,7 @@ namespace viennacl
             #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
             std::cout << "ViennaCL: Setting all devices for context..." << std::endl;
             #endif
-<<<<<<< HEAD
-            
-=======
 
->>>>>>> upstream/1.5.1
             platform pf(pf_index_);
             std::vector<device> devices = pf.devices(device_type_);
             #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
@@ -656,12 +619,8 @@ namespace viennacl
         ProgramContainer programs_;
         std::map< cl_device_id, std::vector< viennacl::ocl::command_queue> > queues_;
         std::string build_options_;
-<<<<<<< HEAD
-        std::size_t pf_index_;
-=======
         vcl_size_t pf_index_;
         vcl_size_t current_queue_id_;
->>>>>>> upstream/1.5.1
     }; //context
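(Illustrative usage, not part of the patch: set_context_platform_index(), set_context_device_num() and context::build_options() configure a context before and during kernel compilation. A hedged sketch; platform index 0, two devices and the -cl-mad-enable flag are only assumptions about a local setup.)

    #include "viennacl/ocl/backend.hpp"

    void configure_context_zero()
    {
      // Must happen before context 0 is initialized, i.e. before the first ViennaCL object is created:
      viennacl::ocl::set_context_platform_index(0, 0);   // use the first OpenCL platform
      viennacl::ocl::set_context_device_num(0, 2);       // set up at most two devices

      // Extra options for the OpenCL compiler; affects programs built afterwards:
      viennacl::ocl::current_context().build_options("-cl-mad-enable");
    }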
 
 
diff --git a/viennacl/ocl/enqueue.hpp b/viennacl/ocl/enqueue.hpp
index 27cdbf8..f2af576 100644
--- a/viennacl/ocl/enqueue.hpp
+++ b/viennacl/ocl/enqueue.hpp
@@ -1,162 +1,3 @@
-<<<<<<< HEAD
-#ifndef VIENNACL_OCL_ENQUEUE_HPP_
-#define VIENNACL_OCL_ENQUEUE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file enqueue.hpp
-    @brief Enqueues kernels into command queues
-*/
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/ocl/command_queue.hpp"
-
-namespace viennacl
-{
-  namespace generator{
-      class custom_operation;
-      void enqueue_custom_op(viennacl::generator::custom_operation & op, viennacl::ocl::command_queue const & queue);
-  }
-  
-  namespace ocl
-  {
-
-    /** @brief Enqueues a kernel in the provided queue */
-    template <typename KernelType>
-    void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue)
-    {
-      // 1D kernel:
-      if (k.local_work_size(1) == 0)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Starting 1D-kernel '" << k.name() << "'..." << std::endl;
-        std::cout << "ViennaCL: Global work size: '"  << k.global_work_size() << "'..." << std::endl;
-        std::cout << "ViennaCL: Local work size: '"   << k.local_work_size() << "'..." << std::endl;
-        #endif
-      
-        size_t tmp_global = k.global_work_size();
-        size_t tmp_local = k.local_work_size();
-        
-        cl_int err;
-        if (tmp_global == 1 && tmp_local == 1)
-          err = clEnqueueTask(queue.handle().get(), k.handle().get(), 0, NULL, NULL);
-        else
-          err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
-
-        if (err != CL_SUCCESS)  //if not successful, try to start with smaller work size
-        {
-          //std::cout << "FAIL: " << std::endl; exit(0);
-          while (err != CL_SUCCESS && tmp_local > 1)
-          {
-            //std::cout << "Flushing queue, then enqueuing again with half the size..." << std::endl;
-            //std::cout << "Error code: " << err << std::endl;
-            
-            tmp_global /= 2;
-            tmp_local /= 2;
-
-            #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-            std::cout << "ViennaCL: Kernel start failed for '" << k.name() << "'." << std::endl;
-            std::cout << "ViennaCL: Global work size: '"  << tmp_global << "'..." << std::endl;
-            std::cout << "ViennaCL: Local work size: '"   << tmp_local << "'..." << std::endl;
-            #endif
-            
-            queue.finish();
-            err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
-          }
-          
-          if (err != CL_SUCCESS)
-          {
-            //could not start kernel with any parameters
-            std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl;
-            std::cerr << "ViennaCL: Smaller work sizes could not solve the problem. " << std::endl;
-            VIENNACL_ERR_CHECK(err);
-          }
-          else
-          {
-            //remember parameters:
-            k.local_work_size(0, tmp_local);
-            k.global_work_size(0, tmp_global);
-            #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-            std::cout << "ViennaCL: Kernel '" << k.name() << "' now uses global work size " << tmp_global << " and local work size " << tmp_local << "."  << std::endl;
-            #endif
-          }          
-        }
-      }
-      else //2D kernel
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Starting 2D-kernel '" << k.name() << "'..." << std::endl;
-        std::cout << "ViennaCL: Global work size: '"  << k.global_work_size(0) << ", " << k.global_work_size(1) << "'..." << std::endl;
-        std::cout << "ViennaCL: Local work size: '"   << k.local_work_size(0) << ", " << k.local_work_size(1) << "'..." << std::endl;
-        #endif
-
-        size_t tmp_global[2]; 
-        tmp_global[0] = k.global_work_size(0);
-        tmp_global[1] = k.global_work_size(1);
-        
-        size_t tmp_local[2];
-        tmp_local[0] = k.local_work_size(0);
-        tmp_local[1] = k.local_work_size(1);
-        
-        cl_int err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 2, NULL, tmp_global, tmp_local, 0, NULL, NULL);
-
-        if (err != CL_SUCCESS)
-        {
-          //could not start kernel with any parameters
-          std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl;
-          VIENNACL_ERR_CHECK(err);
-        }
-        
-      }
-            
-      #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-      queue.finish();
-      std::cout << "ViennaCL: Kernel " << k.name() << " finished!" << std::endl;
-      #endif
-    } //enqueue()
-    
-    
-    /** @brief Convenience function that enqueues the provided kernel into the first queue of the currently active device in the currently active context */
-    template <typename KernelType>
-    void enqueue(KernelType & k)
-    {
-      enqueue(k, viennacl::ocl::current_context().get_queue());
-    }
-    
-    inline void enqueue(viennacl::generator::custom_operation & op, viennacl::ocl::command_queue const & queue)
-    {
-      generator::enqueue_custom_op(op,queue);
-    }
-
-    inline void enqueue(viennacl::generator::custom_operation & op)
-    {
-      enqueue(op, viennacl::ocl::current_context().get_queue());
-    }
-    
-  } // namespace ocl
-} // namespace viennacl
-#endif
-=======
 #ifndef VIENNACL_OCL_ENQUEUE_HPP_
 #define VIENNACL_OCL_ENQUEUE_HPP_
 
@@ -286,4 +127,3 @@ namespace viennacl
   } // namespace ocl
 } // namespace viennacl
 #endif
->>>>>>> upstream/1.5.1
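For reference, a minimal usage sketch of the enqueue() interface retained from upstream/1.5.1. The program and kernel names, and the retrieval via get_program()/get_kernel(), are illustrative placeholders only and not part of this patch:

    #include "viennacl/ocl/backend.hpp"
    #include "viennacl/ocl/enqueue.hpp"

    void launch_sketch()
    {
      // look up a kernel from the active context (placeholder names)
      viennacl::ocl::kernel & k = viennacl::ocl::current_context()
                                    .get_program("my_program")
                                    .get_kernel("my_kernel");

      viennacl::ocl::enqueue(k);   // convenience overload: first queue of the active device
      viennacl::ocl::enqueue(k, viennacl::ocl::current_context().get_queue());   // explicit queue
    }
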
diff --git a/viennacl/ocl/kernel.hpp b/viennacl/ocl/kernel.hpp
index 4ed012a..5b98b97 100644
--- a/viennacl/ocl/kernel.hpp
+++ b/viennacl/ocl/kernel.hpp
@@ -1,756 +1,3 @@
-<<<<<<< HEAD
-#ifndef VIENNACL_OCL_KERNEL_HPP_
-#define VIENNACL_OCL_KERNEL_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file kernel.hpp
-    @brief Representation of an OpenCL kernel in ViennaCL.
-*/
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#include "viennacl/ocl/forwards.h"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/program.hpp"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/local_mem.hpp"
-
-namespace viennacl
-{
-  namespace ocl
-  {
-    
-    /** @brief Represents an OpenCL kernel within ViennaCL */
-    class kernel
-    {
-      template <typename KernelType>
-      friend void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue);
-      
-      
-    public:
-      typedef std::size_t            size_type;
-      
-      kernel() : handle_(0)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Creating kernel object (default CTOR)" << std::endl;
-        #endif
-        set_work_size_defaults();
-      }
-      
-      kernel(viennacl::ocl::handle<cl_program> const & prog, std::string const & name) 
-       : handle_(0), program_(prog), name_(name), init_done_(false)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Creating kernel object (full CTOR)" << std::endl;
-        #endif
-        set_work_size_defaults();
-      }
-      
-      kernel(kernel const & other) 
-       : handle_(other.handle_), program_(other.program_), name_(other.name_), init_done_(other.init_done_)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Creating kernel object (Copy CTOR)" << std::endl;
-        #endif
-        local_work_size_[0] = other.local_work_size_[0];
-        local_work_size_[1] = other.local_work_size_[1];
-        
-        global_work_size_[0] = other.global_work_size_[0];
-        global_work_size_[1] = other.global_work_size_[1];
-      }
-      
-      viennacl::ocl::kernel & operator=(const kernel & other)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Assigning kernel object" << std::endl;
-        #endif
-        handle_ = other.handle_;
-        program_ = other.program_;
-        name_ = other.name_;
-        init_done_ = other.init_done_;
-        local_work_size_[0] = other.local_work_size_[0];
-        local_work_size_[1] = other.local_work_size_[1];
-        global_work_size_[0] = other.global_work_size_[0];
-        global_work_size_[1] = other.global_work_size_[1];
-        return *this;
-      }
-      
-      
-      /** @brief Sets an unsigned integer argument at the provided position */
-      void arg(unsigned int pos, cl_uint val)
-      {
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting unsigned long kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_uint), (void*)&val);
-        VIENNACL_ERR_CHECK(err);
-      }
-
-      /** @brief Sets a single precision floating point argument at the provided position */
-      void arg(unsigned int pos, float val)
-      {
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting floating point kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(float), (void*)&val);
-        VIENNACL_ERR_CHECK(err);
-      }
-
-      /** @brief Sets a double precision floating point argument at the provided position */
-      void arg(unsigned int pos, double val)
-      {
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting double precision kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(double), (void*)&val);
-        VIENNACL_ERR_CHECK(err);
-      }
-
-      //generic handling: call .handle() member
-      /** @brief Sets an OpenCL memory object at the provided position */
-      template<class VCL_TYPE>
-      void arg(unsigned int pos, VCL_TYPE const & val)
-      {
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting generic kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        cl_mem temp = val.handle().get();
-        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_mem), (void*)&temp);
-        VIENNACL_ERR_CHECK(err);
-      }
-      
-      //forward handles directly:
-      /** @brief Sets an OpenCL object at the provided position */
-      template<class CL_TYPE>
-      void arg(unsigned int pos, viennacl::ocl::handle<CL_TYPE> const & h)
-      {
-        //arg(pos, h);
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting handle kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        CL_TYPE temp = h.get();
-        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(CL_TYPE), (void*)&temp);
-        VIENNACL_ERR_CHECK(err);
-      }
-      
-      
-      //local buffer argument:
-      /** @brief Sets an OpenCL local memory object at the provided position */
-      void arg(unsigned int pos, const local_mem & mem)
-      {
-        unsigned int size =  mem.size();
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting local memory kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        cl_int err = clSetKernelArg(handle_.get(), pos, size, 0);
-        VIENNACL_ERR_CHECK(err);
-      }
-      
-      
-      
-      /** @brief Convenience function for setting one kernel parameter */
-      template <typename T0>
-      kernel & operator()(T0 const & t0)
-      {
-         arg(0, t0);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting two kernel parameters */
-      template <typename T0, typename T1>
-      kernel & operator()(T0 const & t0, T1 const & t1)
-      {
-         arg(0, t0); arg(1, t1);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting three kernel parameters */
-      template <typename T0, typename T1, typename T2>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting four kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting five kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting six kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting seven kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting eight kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6, T7 const & t7)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting nine kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6, T7 const & t7, T8 const & t8)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting ten kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4,
-                typename T5, typename T6, typename T7, typename T8, typename T9>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4,
-                          T5 const & t5, T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting eleven kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting twelve kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting thirteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11, typename T12>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11, T12 const & t12)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11); arg(12, t12);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting fourteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting fifteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting sixteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting seventeen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting eighteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting nineteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting twenty kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting twentyone kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting twentytwo kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting 23 kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21, typename T22>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21);  arg(22, t22);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting 24 kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting 25 kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
-                typename T24>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
-                          T24 const & t24
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
-         arg(24, t24);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting 26 kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
-                typename T24, typename T25>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
-                          T24 const & t24, T25 const & t25 
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
-         arg(24, t24); arg(25, t25);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting 27 kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
-                typename T24, typename T25, typename T26>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
-                          T24 const & t24, T25 const & t25, T26 const & t26 
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
-         arg(24, t24); arg(25, t25); arg(26, t26);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting 28 kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
-                typename T24, typename T25, typename T26, typename T27>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
-                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27 
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
-         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting 29 kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
-                typename T24, typename T25, typename T26, typename T27, typename T28>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
-                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28 
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
-         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting 30 kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
-                typename T24, typename T25, typename T26, typename T27, typename T28, typename T29>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
-                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29 
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
-         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting 31 kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
-                typename T24, typename T25, typename T26, typename T27, typename T28, typename T29,
-                typename T30>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
-                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29,
-                          T30 const & t30 
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
-         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
-         arg(30, t30);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting 32 kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
-                typename T24, typename T25, typename T26, typename T27, typename T28, typename T29,
-                typename T30, typename T31>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
-                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29,
-                          T30 const & t30, T31 const & t31 
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
-         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
-         arg(30, t30); arg(31, t31);
-         return *this;
-      }     
-
-
-
-
-      /** @brief Returns the local work size at the respective dimension
-      *
-      * @param index   Dimension index (currently either 0 or 1)
-      */
-      size_type local_work_size(int index = 0) const
-      {
-        assert(index == 0 || index == 1);
-        return local_work_size_[index];
-      }
-      /** @brief Returns the global work size at the respective dimension
-      *
-      * @param index   Dimension index (currently either 0 or 1)
-      */
-      size_type global_work_size(int index = 0) const
-      { 
-        assert(index == 0 || index == 1);
-        return global_work_size_[index];
-      }
-
-      /** @brief Sets the local work size at the respective dimension
-      *
-      * @param index   Dimension index (currently either 0 or 1)
-      * @param s       The new local work size
-      */
-      void local_work_size(int index, size_type s)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting local work size to " << s << " at index " << index << " for kernel " << name_ << std::endl;
-        #endif
-        assert(index == 0 || index == 1);
-        local_work_size_[index] = s;
-      }
-      /** @brief Sets the global work size at the respective dimension
-      *
-      * @param index   Dimension index (currently either 0 or 1)
-      * @param s       The new global work size
-      */
-      void global_work_size(int index, size_type s)
-      { 
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting global work size to " << s << " at index " << index << " for kernel " << name_ << std::endl;
-        #endif
-        assert(index == 0 || index == 1);
-        global_work_size_[index] = s;
-      }
-
-      std::string const & name() const { return name_; }
-
-      viennacl::ocl::handle<cl_kernel> const & handle() const { return handle_; }
-
-
-    private:
-      void create_kernel()
-      {
-        cl_int err;
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Building kernel " << name_ << std::endl;
-        #endif
-        handle_ = clCreateKernel(program_.get(), name_.c_str(), &err);
-        
-        if (err != CL_SUCCESS)
-        {
-          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-          std::cout << "ViennaCL: Could not create kernel '" << name_ << "'." << std::endl;
-          #endif
-          //std::cerr << "Could not build kernel '" << name_ << "'." << std::endl;
-        }
-        VIENNACL_ERR_CHECK(err);
-      }
-
-      void set_work_size_defaults()
-      {
-        if (viennacl::ocl::current_device().type() == CL_DEVICE_TYPE_GPU)
-        {
-          local_work_size_[0] = 128; local_work_size_[1] = 0;
-          global_work_size_[0] = 128*128; global_work_size_[1] = 0;
-        }
-        else //assume CPU type:
-        {
-          //conservative assumption: one thread per CPU core:
-          local_work_size_[0] = 1; local_work_size_[1] = 0;
-          global_work_size_[0] = viennacl::ocl::current_device().max_compute_units(); global_work_size_[1] = 0;
-        }
-      }
-
-      void init()
-      {
-        if (!init_done_)
-        {
-          create_kernel();
-          init_done_ = true;
-        }
-      }
-      
-      viennacl::ocl::handle<cl_kernel> handle_;
-      viennacl::ocl::handle<cl_program> program_;
-      std::string name_;
-      bool init_done_;
-      size_type local_work_size_[2];
-      size_type global_work_size_[2];
-    };
-    
-  } //namespace ocl
-} //namespace viennacl
-
-#endif
-=======
 #ifndef VIENNACL_OCL_KERNEL_HPP_
 #define VIENNACL_OCL_KERNEL_HPP_
 
@@ -1587,4 +834,3 @@ namespace viennacl
 } //namespace viennacl
 
 #endif
->>>>>>> upstream/1.5.1
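A hedged sketch of how the kernel interface above is typically driven; the kernel and vector objects, the 1D work sizes (taken from the GPU defaults in the removed HEAD variant), and the opencl_handle() helper are assumptions for illustration:

    #include "viennacl/vector.hpp"
    #include "viennacl/traits/handle.hpp"
    #include "viennacl/ocl/kernel.hpp"
    #include "viennacl/ocl/enqueue.hpp"

    void kernel_args_sketch(viennacl::ocl::kernel & k, viennacl::vector<float> & x)
    {
      k.local_work_size(0, 128);          // 1D launch: the second dimension stays 0
      k.global_work_size(0, 128 * 128);

      // operator() forwards each argument to clSetKernelArg, enqueue() launches the kernel
      viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
                               static_cast<cl_uint>(x.size())));
    }
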
diff --git a/viennacl/ocl/platform.hpp b/viennacl/ocl/platform.hpp
index 2ea5576..11a4708 100644
--- a/viennacl/ocl/platform.hpp
+++ b/viennacl/ocl/platform.hpp
@@ -46,11 +46,7 @@ namespace viennacl
     {
 
       public:
-<<<<<<< HEAD
-        platform(std::size_t pf_index = 0)
-=======
         platform(vcl_size_t pf_index = 0)
->>>>>>> upstream/1.5.1
         {
           cl_int err;
           cl_uint num_platforms;
@@ -60,11 +56,6 @@ namespace viennacl
           #endif
           err = clGetPlatformIDs(42, ids, &num_platforms);
           VIENNACL_ERR_CHECK(err);
-<<<<<<< HEAD
-          assert(num_platforms > pf_index && "ViennaCL: ERROR: Not enough platforms found!");          
-          id_ = ids[pf_index];
-          assert(num_platforms > 0 && "ViennaCL: ERROR: No platform found!");          
-=======
           assert(num_platforms > pf_index && bool("ViennaCL: ERROR: Not enough platforms found!"));
           id_ = ids[pf_index];
           assert(num_platforms > 0 && bool("ViennaCL: ERROR: No platform found!"));
@@ -77,7 +68,6 @@ namespace viennacl
         void operator=(cl_platform_id pf_id)
         {
           id_ = pf_id;
->>>>>>> upstream/1.5.1
         }
 
         cl_platform_id id() const
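Illustrative only: constructing a platform with the index-based constructor kept from upstream and reading back its raw handle:

    #include "viennacl/ocl/platform.hpp"

    void platform_sketch()
    {
      viennacl::ocl::platform pf(0);   // first platform; the assert above fires if none is found
      cl_platform_id raw = pf.id();    // raw handle, usable with plain OpenCL calls such as clGetDeviceIDs()
      (void)raw;
    }
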
diff --git a/viennacl/slice.hpp b/viennacl/slice.hpp
index 094eec8..3218745 100644
--- a/viennacl/slice.hpp
+++ b/viennacl/slice.hpp
@@ -2,27 +2,17 @@
 #define VIENNACL_SLICE_HPP_
 
 /* =========================================================================
-<<<<<<< HEAD
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-=======
    Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
    Portions of this software are copyright by UChicago Argonne, LLC.
->>>>>>> upstream/1.5.1
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-<<<<<<< HEAD
-               
-=======
 
->>>>>>> upstream/1.5.1
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -41,11 +31,7 @@ namespace viennacl
 {
 
   /** @brief A slice class that refers to an interval [start, stop), where 'start' is included, and 'stop' is excluded.
-<<<<<<< HEAD
-   * 
-=======
    *
->>>>>>> upstream/1.5.1
    * Similar to the boost::numeric::ublas::basic_range class.
    */
   template <typename SizeType /* see forwards.h for default argument*/,
@@ -58,24 +44,11 @@ namespace viennacl
       typedef size_type            value_type;
       typedef value_type           const_reference;
       typedef const_reference      reference;
-<<<<<<< HEAD
-      
-=======
 
->>>>>>> upstream/1.5.1
       basic_slice() : start_(0), stride_(1), size_(0) {}
       basic_slice(size_type start_index,
                   difference_type stride_arg,
                   size_type size_arg) : start_(start_index), stride_(stride_arg), size_(size_arg) {}
-<<<<<<< HEAD
-        
-        
-      size_type       start() const { return start_; }
-      difference_type stride() const { return stride_; }
-      size_type       size() const { return size_; }
-      
-      const_reference operator()(size_type i) const 
-=======
 
 
       size_type       start() const { return start_; }
@@ -83,38 +56,22 @@ namespace viennacl
       size_type       size() const { return size_; }
 
       const_reference operator()(size_type i) const
->>>>>>> upstream/1.5.1
       {
         assert(i < size());
         return start_ + i * stride_;
       }
       const_reference operator[](size_type i) const { return operator()(i); }
-<<<<<<< HEAD
-      
-      bool operator==(const basic_slice & s) const { return (start_ == s.start_) && (stride_ == s.stride_) && (size_ == s.size_); }
-      bool operator!=(const basic_slice & s) const { return !(*this == s); }
-      
-=======
 
       bool operator==(const basic_slice & s) const { return (start_ == s.start_) && (stride_ == s.stride_) && (size_ == s.size_); }
       bool operator!=(const basic_slice & s) const { return !(*this == s); }
 
->>>>>>> upstream/1.5.1
     private:
       size_type start_;
       difference_type stride_;
       size_type size_;
   };
-<<<<<<< HEAD
-  
-  
-}
-
-#endif
-=======
 
 
 }
 
 #endif
->>>>>>> upstream/1.5.1
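For orientation, a minimal sketch of the slice semantics defined above, where index i maps to start + i * stride:

    #include "viennacl/slice.hpp"

    void slice_sketch()
    {
      viennacl::slice s(2, 3, 5);   // start 2, stride 3, size 5 -> refers to indices 2, 5, 8, 11, 14
      // s(i) and s[i] both return start + i * stride, e.g. s(4) == 14;
      // out-of-range access trips the assert shown above
    }
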
diff --git a/viennacl/toeplitz_matrix.hpp b/viennacl/toeplitz_matrix.hpp
index e48c511..af7adc5 100644
--- a/viennacl/toeplitz_matrix.hpp
+++ b/viennacl/toeplitz_matrix.hpp
@@ -268,11 +268,7 @@ namespace viennacl {
 
         for(vcl_size_t i = 0; i < size; i++) {
             s << "(";
-<<<<<<< HEAD
-            for(std::size_t j = 0; j < size; j++) {
-=======
             for(vcl_size_t j = 0; j < size; j++) {
->>>>>>> upstream/1.5.1
                 s << tmp[static_cast<int>(j) - static_cast<int>(i) + static_cast<int>(size - 1)];
                 //s << (int)i - (int)j;
                 if(j < (size - 1)) s << ",";
diff --git a/viennacl/tools/adapter.hpp b/viennacl/tools/adapter.hpp
index 8e2947e..0b753ea 100644
--- a/viennacl/tools/adapter.hpp
+++ b/viennacl/tools/adapter.hpp
@@ -139,17 +139,10 @@ namespace viennacl
         }
 
         bool operator!=(self_type const & other) const { return !(*this == other); }
-<<<<<<< HEAD
-        
-        size_type index1() const { return i_; }
-        size_type index2() const
-        { 
-=======
 
         size_type index1() const { return i_; }
         size_type index2() const
         {
->>>>>>> upstream/1.5.1
           if (is_iterator1)
             return 0;
           else
@@ -313,17 +306,10 @@ namespace viennacl
           return (iter2 == other.iter2);
         }
         bool operator!=(self_type const & other) const { return !(*this == other); }
-<<<<<<< HEAD
-        
-        size_type index1() const { return i_; }
-        size_type index2() const
-        { 
-=======
 
         size_type index1() const { return i_; }
         size_type index2() const
         {
->>>>>>> upstream/1.5.1
           if (is_iterator1)
             return 0;
           else
diff --git a/viennacl/tools/matrix_size_deducer.hpp b/viennacl/tools/matrix_size_deducer.hpp
index 7e011a2..b15dc1b 100644
--- a/viennacl/tools/matrix_size_deducer.hpp
+++ b/viennacl/tools/matrix_size_deducer.hpp
@@ -168,35 +168,11 @@ namespace viennacl
                                viennacl::matrix_base<ScalarType, F2> const & rhs) { return rhs.size2(); }
     };
 
-<<<<<<< HEAD
-    template <typename T1, typename MatrixType2>
-    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<T1,
-                                                                 T1, op_trans>,
-                               const viennacl::matrix_slice<MatrixType2>,
-                               viennacl::op_prod>
-    {
-      static std::size_t size1(viennacl::matrix_expression<T1,
-                                                           T1,
-                                                           op_trans> const & lhs,
-                               viennacl::matrix_slice<MatrixType2> const & rhs) { return lhs.lhs().size2(); }
-      static std::size_t size2(viennacl::matrix_expression<T1,
-                                                           T1,
-                                                           op_trans> const & lhs,
-                               viennacl::matrix_slice<MatrixType2> const & rhs) { return rhs.size2(); }
-    };
-    
-    
-    // A * B^T 
-    
-    template <typename ScalarType, typename F1, unsigned int A1, typename T2>
-    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix<ScalarType, F1, A1>,
-=======
 
     // A * B^T
 
     template <typename ScalarType, typename F1, typename T2>
     struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_base<ScalarType, F1>,
->>>>>>> upstream/1.5.1
                                const viennacl::matrix_expression<T2,
                                                                  T2, op_trans>,
                                viennacl::op_mat_mat_prod>
@@ -231,27 +207,7 @@ namespace viennacl
       static vcl_size_t size2(LHSType const & /*lhs*/,
                                RHSType const & rhs) { return rhs.lhs().size1(); }
     };
-<<<<<<< HEAD
-
-    template <typename MatrixType1, typename T2>
-    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_slice<MatrixType1>,
-                               const viennacl::matrix_expression<T2,
-                                                                 T2, op_trans>,
-                               viennacl::op_prod>
-    {
-      static std::size_t size1(viennacl::matrix_slice<MatrixType1> const & lhs,
-                               viennacl::matrix_expression<T2,
-                                                           T2,
-                                                           op_trans> const & rhs) { return lhs.size1(); }
-      static std::size_t size2(viennacl::matrix_slice<MatrixType1> const & lhs,
-                               viennacl::matrix_expression<T2,
-                                                           T2,
-                                                           op_trans> const & rhs) { return rhs.lhs().size1(); }
-    };
-    
-=======
     /** \endcond */
->>>>>>> upstream/1.5.1
   }
 }
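A hedged sketch of the A * B^T case covered by the retained specialisation; the concrete matrix sizes are illustrative:

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/prod.hpp"

    void size_deduction_sketch()
    {
      viennacl::matrix<float> A(4, 7), B(5, 7), C(4, 5);
      C = viennacl::linalg::prod(A, viennacl::trans(B));   // result size deduced as A.size1() x B.size1() = 4 x 5
    }
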
 
diff --git a/viennacl/tools/tools.hpp b/viennacl/tools/tools.hpp
index 5044ead..028d3aa 100644
--- a/viennacl/tools/tools.hpp
+++ b/viennacl/tools/tools.hpp
@@ -1,414 +1,3 @@
-<<<<<<< HEAD
-#ifndef VIENNACL_TOOLS_TOOLS_HPP_
-#define VIENNACL_TOOLS_TOOLS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file tools.hpp
-    @brief Various little tools used here and there in ViennaCL.
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/tools/adapter.hpp"
-
-
-#ifdef VIENNACL_HAVE_UBLAS  
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#endif
-
-#ifdef VIENNACL_HAVE_EIGEN  
-#include <Eigen/Core>
-#include <Eigen/Sparse>
-#endif
-
-#ifdef VIENNACL_HAVE_MTL4
-#include <boost/numeric/mtl/mtl.hpp>
-#endif
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-  namespace tools
-  {
-    
-    /** @brief Supply suitable increment functions for the iterators: */
-    template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    struct MATRIX_ITERATOR_INCREMENTER<viennacl::row_iteration, viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
-    {
-      static void apply(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat, unsigned int & row, unsigned int & col) { ++row; }
-    };
-
-    template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    struct MATRIX_ITERATOR_INCREMENTER<viennacl::col_iteration, viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
-    {
-      static void apply(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat, unsigned int & row, unsigned int & col) { ++col; }
-    };
-
-    
-    /** @brief A guard that checks whether the floating point type of GPU types is either float or double */
-    template <typename T>
-    struct CHECK_SCALAR_TEMPLATE_ARGUMENT
-    {
-        typedef typename T::ERROR_SCALAR_MUST_HAVE_TEMPLATE_ARGUMENT_FLOAT_OR_DOUBLE  ResultType;
-    };
-    
-    template <>
-    struct CHECK_SCALAR_TEMPLATE_ARGUMENT<float>
-    {
-        typedef float  ResultType;
-    };
-    
-    template <>
-    struct CHECK_SCALAR_TEMPLATE_ARGUMENT<double>
-    {
-        typedef double  ResultType;
-    };
-
-    
-    
-    /** @brief Reads text from a file into a std::string
-    *
-    * @param filename   The filename
-    * @return The text read from the file
-    */
-    inline std::string readTextFromFile(const std::string & filename)
-    {
-      std::ifstream f(filename.c_str());
-      if (!f) return std::string();
-
-      std::stringstream result;
-      std::string tmp;
-      while (std::getline(f, tmp))
-        result << tmp << std::endl;
-
-      return result.str();
-    }
-
-    /** @brief Replaces all occurrences of a substring by another string
-    *
-    * @param text   The string to search in
-    * @param to_search  The substring to search for
-    * @param to_replace The replacement for found substrings
-    * @return The resulting string
-    */
-    inline std::string strReplace(const std::string & text, std::string to_search, std::string to_replace)
-    {
-      std::string::size_type pos = 0;
-      std::string result;
-      std::string::size_type found;
-      while( (found = text.find(to_search, pos)) != std::string::npos )
-      {
-        result.append(text.substr(pos,found-pos));
-        result.append(to_replace);
-        pos = found + to_search.length();
-      }
-      if (pos < text.length())
-        result.append(text.substr(pos));
-      return result;
-    }
-
-    /** @brief Rounds an integer to the next multiple of another integer
-    *
-    * @tparam INT_TYPE  The integer type
-    * @param to_reach   The integer to be rounded up (ceil operation)
-    * @param base       The base
-    * @return The smallest multiple of 'base' that is greater than or equal to 'to_reach'
-    */
-    template <class INT_TYPE>
-    INT_TYPE roundUpToNextMultiple(INT_TYPE to_reach, INT_TYPE base)
-    {
-      if (to_reach % base == 0) return to_reach;
-      return ((to_reach / base) + 1) * base;
-    }
-    
-    
-    /** @brief Create a double precision kernel out of a single precision kernel
-    *
-    * @param source          The source string
-    * @param fp_extension    An info string that specifies the OpenCL double precision extension
-    * @return   The double precision kernel
-    */
-    inline std::string make_double_kernel(std::string const & source, std::string const & fp_extension)
-    {
-      std::stringstream ss;
-      ss << "#pragma OPENCL EXTENSION " << fp_extension << " : enable\n\n";
-      
-      std::string result = ss.str();
-      result.append(strReplace(source, "float", "double"));
-      return result;
-    }
-    
-    
-    /** @brief Removes the const qualifier from a type */
-    template <typename T>
-    struct CONST_REMOVER
-    {
-      typedef T   ResultType;
-    };
-
-    template <typename T>
-    struct CONST_REMOVER<const T>
-    {
-      typedef T   ResultType;
-    };
-
-
-    /** @brief Extracts the vector type from one of the two arguments. Used for the vector_expression type.
-    *
-    * @tparam LHS   The left hand side operand of the vector_expression
-    * @tparam RHS   The right hand side operand of the vector_expression
-    */
-    template <typename LHS, typename RHS>
-    struct VECTOR_EXTRACTOR_IMPL
-    {
-      typedef typename LHS::ERROR_COULD_NOT_EXTRACT_VECTOR_INFORMATION_FROM_VECTOR_EXPRESSION  ResultType;
-    };
-    
-    template <typename LHS, typename ScalarType, unsigned int A>
-    struct VECTOR_EXTRACTOR_IMPL<LHS, viennacl::vector<ScalarType, A> >
-    {
-      typedef viennacl::vector<ScalarType, A>   ResultType;
-    };
-
-    template <typename LHS, typename VectorType>
-    struct VECTOR_EXTRACTOR_IMPL<LHS, viennacl::vector_range<VectorType> >
-    {
-      typedef VectorType   ResultType;
-    };
-
-    template <typename LHS, typename VectorType>
-    struct VECTOR_EXTRACTOR_IMPL<LHS, viennacl::vector_slice<VectorType> >
-    {
-      typedef VectorType   ResultType;
-    };
-
-    
-    template <typename RHS, typename ScalarType, unsigned int A>
-    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector<ScalarType, A>, RHS>
-    {
-      typedef viennacl::vector<ScalarType, A>   ResultType;
-    };
-
-    template <typename VectorType, typename RHS>
-    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector_range<VectorType>, RHS>
-    {
-      typedef VectorType   ResultType;
-    };
-
-    template <typename VectorType, typename RHS>
-    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector_slice<VectorType>, RHS>
-    {
-      typedef VectorType   ResultType;
-    };
-    
-    //resolve ambiguities for previous cases:
-    template <typename ScalarType, unsigned int A>
-    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector<ScalarType, A>, viennacl::vector<ScalarType, A> >
-    {
-      typedef viennacl::vector<ScalarType, A>   ResultType;
-    };
-
-    template <typename VectorType>
-    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector_range<VectorType>, viennacl::vector_range<VectorType> >
-    {
-      typedef VectorType   ResultType;
-    };
-    
-    template <typename VectorType>
-    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector_slice<VectorType>, viennacl::vector_slice<VectorType> >
-    {
-      typedef VectorType   ResultType;
-    };
-    
-    
-    template <typename LHS, typename RHS>
-    struct VECTOR_EXTRACTOR
-    {
-      typedef typename VECTOR_EXTRACTOR_IMPL<typename CONST_REMOVER<LHS>::ResultType,
-                                              typename CONST_REMOVER<RHS>::ResultType>::ResultType      ResultType;
-    };
-
-    /** @brief Deduces the size of the resulting vector represented by a vector_expression from the operands
-    *
-    * @tparam LHS   The left hand side operand
-    * @tparam RHS   The right hand side operand
-    * @tparam OP    The operation tag
-    */
-    template <typename LHS, typename RHS, typename OP>
-    struct VECTOR_SIZE_DEDUCER
-    {
-      //take care: using a plain, naive .size() on the left hand side type can cause subtle side-effects!
-    };
-
-    
-    template <typename ScalarType, unsigned int A, typename RHS>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::vector<ScalarType, A>, RHS, viennacl::op_add>
-    {
-      static size_t size(const viennacl::vector<ScalarType, A> & lhs,
-                         const RHS & rhs) { return lhs.size(); }
-    };
-
-    template <typename ScalarType, unsigned int A, typename RHS>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::vector<ScalarType, A>, RHS, viennacl::op_sub>
-    {
-      static size_t size(const viennacl::vector<ScalarType, A> & lhs,
-                         const RHS & rhs) { return lhs.size(); }
-    };
-    
-    
-   
-    //Standard case: LHS is the vector type and carries the correct size
-    template <typename ScalarType, unsigned int A, typename RHS>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::vector<ScalarType, A>, RHS, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::vector<ScalarType, A> & lhs,
-                         const RHS & rhs) { return lhs.size(); }
-    };
-
-    template <typename ScalarType, unsigned int A, typename RHS>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::vector<ScalarType, A>, RHS, viennacl::op_div>
-    {
-      static size_t size(const viennacl::vector<ScalarType, A> & lhs,
-                         const RHS & rhs) { return lhs.size(); }
-    };
-    
-    //special case: matrix-vector product: Return the number of rows of the matrix
-    template <typename ScalarType, typename F, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::matrix<ScalarType, F, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::matrix<ScalarType, F, Amat> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-    };
-
-    template <typename ScalarType, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::circulant_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::circulant_matrix<ScalarType, Amat> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-    };
-    
-    template <typename ScalarType, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::compressed_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::compressed_matrix<ScalarType, Amat> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-    };
-
-    template <typename ScalarType, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::coordinate_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::coordinate_matrix<ScalarType, Amat> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-    };
-
-    template <typename ScalarType, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::ell_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::ell_matrix<ScalarType, Amat> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-    };
-
-    template <typename ScalarType, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::hyb_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::hyb_matrix<ScalarType, Amat> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-    };
-    
-    
-    //special case: transposed matrix-vector product: Return the number of cols(!) of the matrix
-    template <typename ScalarType, typename F, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::matrix_expression< const viennacl::matrix<ScalarType, F, Amat>,
-                                                                  const viennacl::matrix<ScalarType, F, Amat>,
-                                                                  op_trans>,
-                               const viennacl::vector<ScalarType, A>,
-                               viennacl::op_prod>
-    {
-      static size_t size(const viennacl::matrix_expression< const viennacl::matrix<ScalarType, F, Amat>,
-                                                            const viennacl::matrix<ScalarType, F, Amat>,
-                                                            op_trans> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.lhs().size2(); }
-    };
-
-    
-    
-    
-    
-    /** @brief Obtain the cpu scalar type from a type, including a GPU type like viennacl::scalar<T>
-    *
-    * @tparam T   Either a CPU scalar type or a GPU scalar type
-    */
-    template <typename T>
-    struct CPU_SCALAR_TYPE_DEDUCER
-    {
-      //force compiler error if type cannot be deduced
-      //typedef T       ResultType;
-    };
-
-    template <>
-    struct CPU_SCALAR_TYPE_DEDUCER< float >
-    {
-      typedef float       ResultType;
-    };
-
-    template <>
-    struct CPU_SCALAR_TYPE_DEDUCER< double >
-    {
-      typedef double       ResultType;
-    };
-    
-    template <typename T>
-    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::scalar<T> >
-    {
-      typedef T       ResultType;
-    };
-
-    template <typename T, unsigned int A>
-    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::vector<T, A> >
-    {
-      typedef T       ResultType;
-    };
-
-    template <typename T, typename F, unsigned int A>
-    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::matrix<T, F, A> >
-    {
-      typedef T       ResultType;
-    };
-
-    
-    template <typename T, typename F, unsigned int A>
-    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::matrix_expression<const matrix<T, F, A>, const matrix<T, F, A>, op_trans> >
-    {
-      typedef T       ResultType;
-    };
-
-        
-  } //namespace tools
-} //namespace viennacl
-    
-
-#endif
-=======
 #ifndef VIENNACL_TOOLS_TOOLS_HPP_
 #define VIENNACL_TOOLS_TOOLS_HPP_
 
@@ -698,4 +287,3 @@ namespace viennacl
 
 
 #endif
->>>>>>> upstream/1.5.1
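
The replaced tools.hpp revolves around small helpers such as roundUpToNextMultiple (buffer padding), strReplace, and make_double_kernel (deriving a double-precision kernel from a float kernel by text substitution). A self-contained sketch of those two ideas, as simplified re-implementations rather than the library's code:

#include <iostream>
#include <string>

// Round 'to_reach' up to the next multiple of 'base' (same idea as roundUpToNextMultiple).
template <class INT_TYPE>
INT_TYPE round_up(INT_TYPE to_reach, INT_TYPE base)
{
  return (to_reach % base == 0) ? to_reach : ((to_reach / base) + 1) * base;
}

// Replace every occurrence of 'what' by 'with' (same idea as strReplace).
std::string replace_all(std::string text, const std::string & what, const std::string & with)
{
  for (std::string::size_type pos = 0; (pos = text.find(what, pos)) != std::string::npos; pos += with.size())
    text.replace(pos, what.size(), with);
  return text;
}

int main()
{
  std::cout << round_up(100, 16) << "\n";   // 112: padded buffer length for ALIGNMENT = 16

  // A double-precision kernel is obtained by enabling the fp64 extension
  // and textually substituting 'float' -> 'double' in the kernel source:
  std::string src = "__kernel void scale(__global float * x) { x[0] *= 2.0f; }";
  std::cout << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n\n"
            << replace_all(src, "float", "double") << "\n";
}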
diff --git a/viennacl/traits/handle.hpp b/viennacl/traits/handle.hpp
index fe6a622..fa196cc 100644
--- a/viennacl/traits/handle.hpp
+++ b/viennacl/traits/handle.hpp
@@ -1,109 +1,3 @@
-<<<<<<< HEAD
-#ifndef VIENNACL_TRAITS_HANDLE_HPP_
-#define VIENNACL_TRAITS_HANDLE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file traits/handle.hpp
-    @brief Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc.
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-namespace viennacl
-{
-  namespace traits
-  {
-    
-    // Returns the OpenCL handle of a ViennaCL object
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(T & obj)
-    {
-      return obj.handle();
-    }
-
-
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::vector_range<T> & obj)
-    {
-      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
-    }
-    
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::vector_range<T> const & obj)
-    {
-      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
-    }
-
-
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::vector_slice<T> & obj)
-    {
-      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
-    }
-
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::vector_slice<T> const & obj)
-    {
-      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
-    }
-
-
-
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::matrix_range<T> & obj)
-    {
-      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
-    }
-
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::matrix_range<T> const & obj)
-    {
-      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
-    }
-
-
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::matrix_slice<T> & obj)
-    {
-      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
-    }
-
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::matrix_slice<T> const & obj)
-    {
-      return viennacl::ocl::handle<cl_mem>(obj.get().handle());
-    }
-
-  } //namespace traits
-} //namespace viennacl
-    
-
-#endif
-=======
 #ifndef VIENNACL_TRAITS_HANDLE_HPP_
 #define VIENNACL_TRAITS_HANDLE_HPP_
 
@@ -349,4 +243,3 @@ namespace viennacl
 
 
 #endif
->>>>>>> upstream/1.5.1
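
traits/handle.hpp follows a single pattern: a generic overload asks the object itself for its handle, while proxy types (ranges and slices) forward to the object they wrap. A toy illustration of that overload-based dispatch, with hypothetical types and no OpenCL dependency:

#include <iostream>
#include <string>

struct Buffer { std::string handle() const { return "cl_mem@0x1"; } };

// Hypothetical proxy wrapping a Buffer, as vector_range/matrix_slice wrap their host object.
template <typename T>
struct Range { T & obj; T & get() const { return obj; } };

namespace traits {
  // Default: ask the object itself.
  template <typename T>
  std::string handle(const T & obj) { return obj.handle(); }

  // Proxies: forward to the wrapped object.
  template <typename T>
  std::string handle(const Range<T> & r) { return r.get().handle(); }
}

int main()
{
  Buffer b;
  Range<Buffer> r{b};
  std::cout << traits::handle(b) << "\n" << traits::handle(r) << "\n";  // same handle twice
}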
diff --git a/viennacl/traits/size.hpp b/viennacl/traits/size.hpp
index f95ba78..4c8bd08 100644
--- a/viennacl/traits/size.hpp
+++ b/viennacl/traits/size.hpp
@@ -1,246 +1,3 @@
-<<<<<<< HEAD
-#ifndef VIENNACL_TRAITS_SIZE_HPP_
-#define VIENNACL_TRAITS_SIZE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file size.hpp
-    @brief Generic size and resize functionality for different vector and matrix types
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/meta/result_of.hpp"
-
-#ifdef VIENNACL_HAVE_UBLAS  
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#endif
-
-#ifdef VIENNACL_HAVE_EIGEN  
-#include <Eigen/Core>
-#include <Eigen/Sparse>
-#endif
-
-#ifdef VIENNACL_HAVE_MTL4
-#include <boost/numeric/mtl/mtl.hpp>
-#endif
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-
-  namespace traits
-  {
-    //
-    // Resize: Change the size of vectors and matrices
-    //
-    template <typename MatrixType>
-    void resize(MatrixType & matrix, size_t rows, size_t cols)
-    {
-      matrix.resize(rows, cols); 
-    }
-    
-    template <typename VectorType>
-    void resize(VectorType & vec, size_t new_size)
-    {
-      vec.resize(new_size); 
-    }
-    
-    #ifdef VIENNACL_HAVE_UBLAS  
-    //ublas needs separate treatment:
-    template <typename ScalarType>
-    void resize(boost::numeric::ublas::compressed_matrix<ScalarType> & matrix,
-                size_t rows,
-                size_t cols)
-    {
-      matrix.resize(rows, cols, false); //Note: omitting third parameter leads to compile time error (not implemented in ublas <= 1.42) 
-    }
-    #endif  
-    
-    
-    #ifdef VIENNACL_HAVE_MTL4
-    template <typename ScalarType>
-    void resize(mtl::compressed2D<ScalarType> & matrix,
-                size_t rows,
-                size_t cols)
-    {
-      matrix.change_dim(rows, cols);
-    }
-    
-    template <typename ScalarType>
-    void resize(mtl::dense_vector<ScalarType> & vec,
-                size_t new_size)
-    {
-      vec.change_dim(new_size);
-    }
-    #endif      
-
-    #ifdef VIENNACL_HAVE_EIGEN
-    inline void resize(Eigen::MatrixXf & m,
-                       std::size_t new_rows,
-                       std::size_t new_cols)
-    {
-      m.resize(new_rows, new_cols);
-    }
-    
-    inline void resize(Eigen::MatrixXd & m,
-                       std::size_t new_rows,
-                       std::size_t new_cols)
-    {
-      m.resize(new_rows, new_cols);
-    }
-    
-    template <typename T, int options>
-    inline void resize(Eigen::SparseMatrix<T, options> & m,
-                       std::size_t new_rows,
-                       std::size_t new_cols)
-    {
-      m.resize(new_rows, new_cols);
-    }    
-    
-    inline void resize(Eigen::VectorXf & v,
-                       std::size_t new_size)
-    {
-      v.resize(new_size);
-    }
-    
-    inline void resize(Eigen::VectorXd & v,
-                       std::size_t new_size)
-    {
-      v.resize(new_size);
-    }
-    #endif
-
-
-    //
-    // size: Returns the length of vectors
-    //
-    template <typename VectorType>
-    typename result_of::size_type<VectorType>::type size(VectorType const & vec)
-    {
-      return vec.size(); 
-    }
-
-    #ifdef VIENNACL_HAVE_MTL4
-    template <typename ScalarType>
-    typename result_of::size_type< mtl::dense_vector<ScalarType> >::type
-    size(mtl::dense_vector<ScalarType> const & vec) { return vec.used_memory(); }
-    #endif
-    
-    #ifdef VIENNACL_HAVE_EIGEN
-    inline std::size_t size(Eigen::VectorXf const & v) { return v.rows(); }
-    inline std::size_t size(Eigen::VectorXd const & v) { return v.rows(); }
-    #endif
-
-    //
-    // size1: No. of rows for matrices
-    //
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    size1(MatrixType const & mat) { return mat.size1(); }
-
-    #ifdef VIENNACL_HAVE_EIGEN
-    inline std::size_t size1(Eigen::MatrixXf const & m) { return m.rows(); }
-    inline std::size_t size1(Eigen::MatrixXd const & m) { return m.rows(); }
-    template <typename T, int options>
-    inline std::size_t size1(Eigen::SparseMatrix<T, options> & m) { return m.rows(); }    
-    #endif
-
-    //
-    // size2: No. of columns for matrices
-    //
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    size2(MatrixType const & mat) { return mat.size2(); }
- 
-    #ifdef VIENNACL_HAVE_EIGEN
-    inline std::size_t size2(Eigen::MatrixXf const & m) { return m.cols(); }
-    inline std::size_t size2(Eigen::MatrixXd const & m) { return m.cols(); }
-    template <typename T, int options>
-    inline std::size_t size2(Eigen::SparseMatrix<T, options> & m) { return m.cols(); }    
-    #endif
- 
-    //
-    // internal_size: Returns the internal (padded) length of vectors
-    //
-    template <typename VectorType>
-    typename result_of::size_type<VectorType>::type 
-    internal_size(VectorType const & vec)
-    {
-      return vec.internal_size(); 
-    }
-
-    template <typename VectorType>
-    typename result_of::size_type<VectorType>::type 
-    internal_size(viennacl::vector_range<VectorType> const & vec)
-    {
-      return vec.get().internal_size(); 
-    }
-    
-    template <typename VectorType>
-    typename result_of::size_type<VectorType>::type 
-    internal_size(viennacl::vector_slice<VectorType> const & vec)
-    {
-      return vec.get().internal_size(); 
-    }
-
-
-    //
-    // internal_size1: No. of internal (padded) rows for matrices
-    //
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    internal_size1(MatrixType const & mat) { return mat.internal_size1(); }
-
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    internal_size1(viennacl::matrix_range<MatrixType> const & mat) { return mat.get().internal_size1(); }
-
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    internal_size1(viennacl::matrix_slice<MatrixType> const & mat) { return mat.get().internal_size1(); }
-
-
-    //
-    // internal_size2: No. of internal (padded) columns for matrices
-    //
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    internal_size2(MatrixType const & mat) { return mat.internal_size2(); }
- 
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    internal_size2(viennacl::matrix_range<MatrixType> const & mat) { return mat.get().internal_size2(); }
-
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    internal_size2(viennacl::matrix_slice<MatrixType> const & mat) { return mat.get().internal_size2(); }
- 
-  } //namespace traits
-} //namespace viennacl
-    
-
-#endif
-=======
 #ifndef VIENNACL_TRAITS_SIZE_HPP_
 #define VIENNACL_TRAITS_SIZE_HPP_
 
@@ -561,4 +318,3 @@ namespace viennacl
 
 
 #endif
->>>>>>> upstream/1.5.1
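
traits/size.hpp provides one generic size()/resize() front end and adapts foreign containers (uBLAS, Eigen, MTL4) through per-type overloads. A minimal sketch of that uniform-front-end idea using only standard library types plus a hypothetical matrix:

#include <cstddef>
#include <iostream>
#include <vector>

// A toy dense matrix with its own naming convention (hypothetical stand-in for an external type).
struct DenseMat {
  std::size_t rows_ = 0, cols_ = 0;
  void change_dim(std::size_t r, std::size_t c) { rows_ = r; cols_ = c; }
};

namespace traits {
  // Default vector case: forward to the member function.
  template <typename V>
  std::size_t size(const V & v) { return v.size(); }

  template <typename V>
  void resize(V & v, std::size_t n) { v.resize(n); }

  // Per-type overloads adapt foreign naming conventions to the same interface.
  inline std::size_t size1(const DenseMat & m) { return m.rows_; }
  inline std::size_t size2(const DenseMat & m) { return m.cols_; }
  inline void resize(DenseMat & m, std::size_t r, std::size_t c) { m.change_dim(r, c); }
}

int main()
{
  std::vector<double> v(3);
  DenseMat m;
  traits::resize(v, 10);
  traits::resize(m, 4, 5);
  std::cout << traits::size(v) << " " << traits::size1(m) << "x" << traits::size2(m) << "\n";  // 10 4x5
}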
diff --git a/viennacl/traits/start.hpp b/viennacl/traits/start.hpp
index 623ab4e..168f596 100644
--- a/viennacl/traits/start.hpp
+++ b/viennacl/traits/start.hpp
@@ -1,103 +1,3 @@
-<<<<<<< HEAD
-#ifndef VIENNACL_TRAITS_START_HPP_
-#define VIENNACL_TRAITS_START_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file start.hpp
-    @brief Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc.
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-
-namespace viennacl
-{
-  namespace traits
-  {
-    //
-    // start: Mostly for vectors
-    //
-    
-    // Default: Try to get the start index from the .start() member function
-    template <typename T>
-    typename result_of::size_type<T>::type
-    start(T const & obj)
-    {
-      return obj.start();
-    }
-    
-    //ViennaCL vector leads to start index 0:
-    template <typename ScalarType, unsigned int ALIGNMENT>
-    typename result_of::size_type<viennacl::vector<ScalarType, ALIGNMENT> >::type
-    start(viennacl::vector<ScalarType, ALIGNMENT> const & v)
-    {
-      return 0; 
-    }
-
-
-    //
-    // start1: Row start index
-    //
-    
-    // Default: Try to get the start index from the .start1() member function
-    template <typename T>
-    typename result_of::size_type<T>::type
-    start1(T const & obj)
-    {
-      return obj.start1();
-    }
-
-    //ViennaCL matrix leads to start index 0:
-    template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-    typename result_of::size_type<viennacl::matrix<ScalarType, F, ALIGNMENT> >::type
-    start1(viennacl::matrix<ScalarType, F, ALIGNMENT> const & v)
-    {
-      return 0; 
-    }
-
-
-    //
-    // start2: Column start index
-    //
-    template <typename T>
-    typename result_of::size_type<T>::type
-    start2(T const & obj)
-    {
-      return obj.start2();
-    }
-
-    //ViennaCL matrix leads to start index 0:
-    template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-    typename result_of::size_type<viennacl::matrix<ScalarType, F, ALIGNMENT> >::type
-    start2(viennacl::matrix<ScalarType, F, ALIGNMENT> const & v)
-    {
-      return 0; 
-    }
-    
-
-  } //namespace traits
-} //namespace viennacl
-    
-
-#endif
-=======
 #ifndef VIENNACL_TRAITS_START_HPP_
 #define VIENNACL_TRAITS_START_HPP_
 
@@ -199,4 +99,3 @@ namespace viennacl
 
 
 #endif
->>>>>>> upstream/1.5.1
diff --git a/viennacl/traits/stride.hpp b/viennacl/traits/stride.hpp
index 42fb4e6..1b37507 100644
--- a/viennacl/traits/stride.hpp
+++ b/viennacl/traits/stride.hpp
@@ -1,83 +1,3 @@
-<<<<<<< HEAD
-#ifndef VIENNACL_TRAITS_INC_HPP_
-#define VIENNACL_TRAITS_INC_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file stride.hpp
-    @brief Determines row and column increments for matrices and matrix proxies
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/meta/result_of.hpp"
-
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-
-  namespace traits
-  {
-
-    //
-    // inc: Increment for vectors. Defaults to 1
-    //
-    template <typename VectorType>
-    typename result_of::size_type<VectorType>::type
-    stride(VectorType const & vec) { return 1; }
-
-    template <typename VectorType>
-    typename result_of::size_type<VectorType>::type
-    stride(viennacl::vector_slice<VectorType> const & s) { return s.stride(); }
-
-    //
-    // inc1: Row increment for matrices. Defaults to 1
-    //
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    stride1(MatrixType const & mat) { return 1; }
-
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    stride1(matrix_slice<MatrixType> const & s) { return s.stride1(); }
-
-    //
-    // inc2: Column increment for matrices. Defaults to 1
-    //
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    stride2(MatrixType const & mat) { return 1; }
- 
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    stride2(matrix_slice<MatrixType> const & s) { return s.stride2(); }
-
- 
-  } //namespace traits
-} //namespace viennacl
-    
-
-#endif
-=======
 #ifndef VIENNACL_TRAITS_STRIDE_HPP_
 #define VIENNACL_TRAITS_STRIDE_HPP_
 
@@ -153,4 +73,3 @@ namespace viennacl
 
 
 #endif
->>>>>>> upstream/1.5.1
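
start.hpp and stride.hpp encode the convention that a plain vector or matrix starts at offset 0 with unit increments, while ranges and slices report their own offsets and strides; a kernel can then be written once against (start, stride) pairs. A compact sketch of that convention with toy host-side types (assumptions only, no GPU code):

#include <cstddef>
#include <iostream>
#include <vector>

struct Vec { std::vector<double> data; };
// Hypothetical slice proxy: every 'stride'-th element starting at 'start'.
struct VecSlice { Vec & v; std::size_t start; std::size_t stride; std::size_t size; };

namespace traits {
  inline std::size_t start (const Vec &)        { return 0; }        // full vector: offset 0
  inline std::size_t stride(const Vec &)        { return 1; }        // full vector: unit increment
  inline std::size_t start (const VecSlice & s) { return s.start;  }
  inline std::size_t stride(const VecSlice & s) { return s.stride; }
}

// One "kernel" body serves both cases because it only sees (start, stride, size).
template <typename V>
double sum(const V & v, const std::vector<double> & mem, std::size_t n)
{
  double s = 0;
  for (std::size_t i = 0; i < n; ++i)
    s += mem[traits::start(v) + i * traits::stride(v)];
  return s;
}

int main()
{
  Vec v{{1, 2, 3, 4, 5, 6}};
  VecSlice s{v, 1, 2, 3};                       // elements 2, 4, 6
  std::cout << sum(v, v.data, v.data.size()) << " " << sum(s, v.data, s.size) << "\n";  // 21 12
}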
diff --git a/viennacl/vector.hpp b/viennacl/vector.hpp
index a9e5358..23e4906 100644
--- a/viennacl/vector.hpp
+++ b/viennacl/vector.hpp
@@ -1,1728 +1,3 @@
-<<<<<<< HEAD
-#ifndef VIENNACL_VECTOR_HPP_
-#define VIENNACL_VECTOR_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file vector.hpp
-    @brief The vector type with operator-overloads and proxy classes is defined here. 
-           Linear algebra operations such as norms and inner products are located in linalg/vector_operations.hpp
-*/
-
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/tools/entry_proxy.hpp"
-#include "viennacl/linalg/vector_operations.hpp"
-
-namespace viennacl
-{
-    
-    /** @brief An expression template class that represents a binary operation that yields a vector
-    *
-    * In contrast to full expression templates as introduced by Veldhuizen, ViennaCL does not allow nested expressions.
-    * The reason is that this requires automated GPU viennacl::ocl::kernel generation, which then has to be compiled just-in-time.
-    * For performance-critical applications, one is better off writing the appropriate viennacl::ocl::kernels by hand.
-    *
-    * Assumption: dim(LHS) >= dim(RHS), where dim(scalar) = 0, dim(vector) = 1 and dim(matrix) = 2
-    *
-    * @tparam LHS   left hand side operand
-    * @tparam RHS   right hand side operand
-    * @tparam OP    the operator
-    */
-    template <typename LHS, typename RHS, typename OP>
-    class vector_expression
-    {
-      public:
-        /** @brief Extracts the vector type from the two operands.
-        */
-        typedef typename viennacl::tools::VECTOR_EXTRACTOR<LHS, RHS>::ResultType    VectorType;
-      
-        vector_expression(LHS & lhs, RHS & rhs) : _lhs(lhs), _rhs(rhs) {}
-        
-        /** @brief Get left hand side operand
-        */
-        LHS & lhs() const { return _lhs; }
-        /** @brief Get right hand side operand
-        */
-        RHS & rhs() const { return _rhs; }
-        
-        /** @brief Returns the size of the result vector */
-        std::size_t size() const { return viennacl::tools::VECTOR_SIZE_DEDUCER<LHS, RHS, OP>::size(_lhs, _rhs); }
-        
-      private:
-        /** @brief The left hand side operand */
-        LHS & _lhs;
-        /** @brief The right hand side operand */
-        RHS & _rhs;
-    };
-    
-    /** @brief An STL-type const-iterator for vector elements. Elements can be accessed, but cannot be manipulated. VERY SLOW!!
-    *
-    * Every dereference operation initiates a transfer from the GPU to the CPU. The overhead of such a transfer is around 50us, so 20,000 dereferences take one second.
-    * This is four orders of magnitude slower than similar dereferences on the CPU. However, increments and comparisons of iterators are as fast as for CPU types.
-    * If you need a fast iterator, copy the whole vector to the CPU first and iterate over the CPU object, e.g.
-    * std::vector<float> temp;
-    * copy(gpu_vector, temp);
-    * for (std::vector<float>::const_iterator iter = temp.begin();
-    *      iter != temp.end();
-    *      ++iter)
-    * {
-    *   //do something
-    * }
-    * Note that you may obtain inconsistent data if entries of gpu_vector are manipulated elsewhere in the meanwhile.
-    *
-    * @tparam SCALARTYPE  The underlying floating point type (either float or double)
-    * @tparam ALIGNMENT   Alignment of the underlying vector, @see vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT>
-    class const_vector_iterator
-    {
-        typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>    self_type;
-      public:
-        typedef scalar<SCALARTYPE>            value_type;
-        typedef long                          difference_type;
-        
-        const_vector_iterator() {};
-        /** @brief Constructor
-        *   @param vec    The vector over which to iterate
-        *   @param index  The starting index of the iterator
-        */        
-        const_vector_iterator(vector<SCALARTYPE, ALIGNMENT> const & vec,
-                              cl_uint index,
-                              cl_uint start = 0,
-                              vcl_ptrdiff_t stride = 1) : elements_(vec.handle()), index_(index), start_(start), stride_(stride) {};
-                              
-        const_vector_iterator(viennacl::ocl::handle<cl_mem> const & elements,
-                              cl_uint index,
-                              cl_uint start = 0,
-                              vcl_ptrdiff_t stride = 1) : elements_(elements), index_(index), start_(start), stride_(stride) {};
-
-        
-        value_type operator*(void) const 
-        { 
-           value_type result;
-           result = entry_proxy<SCALARTYPE>(start_ + index_ * stride_, elements_);
-           return result;
-        }
-        self_type operator++(void) { index_ += stride_; return *this; }
-        self_type operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
-        
-        bool operator==(self_type const & other) const { return index_ == other.index_; }
-        bool operator!=(self_type const & other) const { return index_ != other.index_; }
-        
-//        self_type & operator=(self_type const & other)
-//        {
-//           _index = other._index;
-//           elements_ = other._elements;
-//           return *this;
-//        }   
-
-        difference_type operator-(self_type const & other) const { difference_type result = index_; return (result - static_cast<difference_type>(other.index_)); }
-        self_type operator+(difference_type diff) const { return self_type(elements_, index_ + diff * stride_, start_, stride_); }
-        
-        //std::size_t index() const { return index_; }
-        std::size_t offset() const { return start_ + index_ * stride_; }
-        std::size_t stride() const { return stride_; }
-        viennacl::ocl::handle<cl_mem> const & handle() const { return elements_; }
-
-      protected:
-        /** @brief  The OpenCL memory handle of the vector over which the iterator runs */
-        viennacl::ocl::handle<cl_mem> elements_;
-        std::size_t index_;  //offset from the beginning of elements_
-        std::size_t start_;
-        vcl_ptrdiff_t stride_;
-    };
-    
-
-    /** @brief An STL-type iterator for vector elements. Elements can be accessed and manipulated. VERY SLOW!!
-    *
-    * Every dereference operation initiates a transfer from the GPU to the CPU. The overhead of such a transfer is around 50us, so 20,000 dereferences take one second.
-    * This is four orders of magnitude slower than similar dereferences on the CPU. However, increments and comparisons of iterators are as fast as for CPU types.
-    * If you need a fast iterator, copy the whole vector to the CPU first and iterate over the CPU object, e.g.
-    * std::vector<float> temp;
-    * copy(gpu_vector, temp);
-    * for (std::vector<float>::const_iterator iter = temp.begin();
-    *      iter != temp.end();
-    *      ++iter)
-    * {
-    *   //do something
-    * }
-    * copy(temp, gpu_vector);
-    * Note that you may obtain inconsistent data if you manipulate entries of gpu_vector in the meanwhile.
-    *
-    * @tparam SCALARTYPE  The underlying floating point type (either float or double)
-    * @tparam ALIGNMENT   Alignment of the underlying vector, @see vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT>
-    class vector_iterator : public const_vector_iterator<SCALARTYPE, ALIGNMENT>
-    {
-        typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>  base_type;
-        typedef vector_iterator<SCALARTYPE, ALIGNMENT>        self_type;
-      public:
-        vector_iterator() : base_type(){};
-        vector_iterator(viennacl::ocl::handle<cl_mem> const & elements, std::size_t index)  : base_type(elements, index) {};
-        /** @brief Constructor
-        *   @param vec    The vector over which to iterate
-        *   @param index  The starting index of the iterator
-        */        
-        vector_iterator(vector<SCALARTYPE, ALIGNMENT> & vec, cl_uint index) : base_type(vec, index) {};
-        vector_iterator(base_type const & b) : base_type(b) {};
-
-        typename base_type::value_type operator*(void)  
-        { 
-           typename base_type::value_type result;
-           result = entry_proxy<SCALARTYPE>(base_type::start_ + base_type::index_ * base_type::stride_, base_type::elements_); 
-           return result;
-        }
-        
-        viennacl::ocl::handle<cl_mem> handle() { return base_type::elements_; }
-        
-        operator base_type() const
-        {
-          return base_type(base_type::elements_, base_type::index_, base_type::start_, base_type::stride_);
-        }
-    };
-
-    // forward definition in forwards.h!
-    /** @brief A vector class representing a linear memory sequence on the GPU. Inspired by boost::numeric::ublas::vector
-    *
-    *  This is the basic vector type of ViennaCL. It is similar to std::vector and boost::numeric::ublas::vector and supports various linear algebra operations.
-    * By default, the internal length of the vector is padded to a multiple of 'ALIGNMENT' in order to speed up several GPU viennacl::ocl::kernels.
-    *
-    * @tparam SCALARTYPE  The floating point type, either 'float' or 'double'
-    * @tparam ALIGNMENT   The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT>
-    class vector
-    {
-      typedef vector<SCALARTYPE, ALIGNMENT>         self_type;
-      
-    public:
-      typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
-      typedef vcl_size_t                                        size_type;
-      typedef vcl_ptrdiff_t                                     difference_type;
-      typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>      const_iterator;
-      typedef vector_iterator<SCALARTYPE, ALIGNMENT>            iterator;
-      
-      static const int alignment = ALIGNMENT;
-
-      /** @brief Default constructor in order to be compatible with various containers.
-      */
-      vector() : size_(0) { viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::init();  }
-
-      /** @brief An explicit constructor for the vector, allocating the given amount of memory (plus a padding specified by 'ALIGNMENT')
-      *
-      * @param vec_size   The length (i.e. size) of the vector.
-      */
-      explicit vector(size_type vec_size) : size_(vec_size)
-      {
-        viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::init(); 
-        
-        if (size_ > 0)
-          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-        
-        //force entries above size_ to zero:
-        if (size_ < internal_size())
-        {
-          std::vector<SCALARTYPE> temp(internal_size() - size_);
-          cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), elements_.get(), CL_TRUE, sizeof(SCALARTYPE)*size_, sizeof(SCALARTYPE)*(internal_size() - size_), &(temp[0]), 0, NULL, NULL);
-          //assert(err == CL_SUCCESS);
-          VIENNACL_ERR_CHECK(err);
-        }
-      }
-
-      /** @brief Create a vector from existing OpenCL memory
-      *
-      * Note: The provided memory must take any ALIGNMENT into account, i.e. existing_mem must be at least of size internal_size()!
-      * This is trivially the case with the default alignment, but should be considered when using vector<> with an alignment parameter not equal to 1.
-      *
-      * @param existing_mem   An OpenCL handle representing the memory
-      * @param vec_size       The size of the vector. 
-      */
-      explicit vector(cl_mem existing_mem, size_type vec_size) : size_(vec_size),  elements_(existing_mem)
-      {
-        elements_.inc();  //prevents the user-provided memory from being deleted once the vector object is destroyed.
-      }
-      
-      template <typename LHS, typename RHS, typename OP>
-      vector(vector_expression<LHS, RHS, OP> const & other) : size_(other.size())
-      {
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*other.size());
-        *this = other;
-      }
-      
-      /** @brief The copy constructor
-      *
-      * Entries of 'vec' are directly copied to this vector.
-      */
-      vector(const self_type & vec) :
-        size_(vec.size())
-      {
-        viennacl::linalg::kernels::vector<SCALARTYPE, 1>::init(); 
-        
-        if (size() != 0)
-        {
-          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-          cl_int err;
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), vec.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
-          //assert(err == CL_SUCCESS);
-          VIENNACL_ERR_CHECK(err);
-        }
-      }
-
-      /** @brief Assignment operator. This vector is resized if 'vec' is of a different size.
-      */
-      self_type & operator=(const self_type & vec)
-      {
-        resize(vec.size());
-        if (size() != 0)
-        {
-          cl_int err;
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), vec.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
-          VIENNACL_ERR_CHECK(err);
-        }
-        return *this;
-      }
-
-
-      /** @brief Implementation of the operation v1 = alpha * v2, where alpha is a GPU scalar
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      template <typename VectorType>   //use template to cover const/non-const of VectorType:
-      self_type & operator = (const vector_expression< VectorType,
-                                                       const scalar<SCALARTYPE>,
-                                                       op_prod> & proxy)
-      {
-        resize(proxy.lhs().size());
-        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
-        viennacl::linalg::mult(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-      /** @brief Implementation of the operation v1 = alpha * v2, where alpha is a CPU scalar
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      template <typename VectorType>   //use template to cover const/non-const of VectorType:
-      self_type & operator = (const vector_expression< VectorType,
-                                                       const SCALARTYPE,
-                                                       op_prod> & proxy)
-      {
-        resize(proxy.lhs().size());
-        viennacl::linalg::mult(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-      /** @brief Implementation of the operation v1 = v2 / alpha, where alpha is a GPU scalar
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      template <typename VectorType>   //use template to cover const/non-const of VectorType:
-      self_type & operator = (const vector_expression< VectorType,
-                                                       const scalar<SCALARTYPE>,
-                                                       op_div> & proxy)
-      {
-        resize(proxy.lhs().size());
-        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
-        viennacl::linalg::divide(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-      /** @brief Implementation of the operation v1 = v2 / alpha, where alpha is a CPU scalar
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      template <typename VectorType>   //use template to cover const/non-const of VectorType:
-      self_type & operator = (const vector_expression< VectorType,
-                                                       const SCALARTYPE,
-                                                       op_div> & proxy)
-      {
-        resize(proxy.lhs().size());
-        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
-        viennacl::linalg::mult(proxy.lhs(), static_cast<SCALARTYPE>(1.0) / proxy.rhs(), *this);
-        return *this;
-      }
-
-      //v1 = v2 + v3; 
-      /** @brief Implementation of the operation v1 = v2 + v3
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      self_type & operator = (const vector_expression< const self_type,
-                                                       const self_type,
-                                                       op_add> & proxy)
-      {
-        assert(proxy.lhs().size() == size() && "Incompatible vector sizes!");
-        //resize(proxy.lhs().size());
-        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
-        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      
-      //v1 = v2 - v3; 
-      /** @brief Implementation of the operation v1 = v2 - v3
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      self_type & operator = (const vector_expression< const self_type,
-                                                       const self_type,
-                                                       op_sub> & proxy)
-      {
-        assert(proxy.lhs().size() == size() && "Incompatible vector sizes!");
-        //resize(proxy.lhs().size());
-        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
-        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-
-      // assign vector range or vector slice (implemented in vector_proxy.hpp)
-      self_type & operator = (const vector_range<self_type> &);
-      self_type & operator = (const vector_slice<self_type> &);
-      
-      ///////////////////////////// Matrix Vector interaction start ///////////////////////////////////
-
-      //Note: The following operator overloads are defined in matrix_operations.hpp, compressed_matrix_operations.hpp and coordinate_matrix_operations.hpp
-      //This is certainly not the nicest approach and will most likely be changed in the future, but it works :-)
-      
-      //matrix<>
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      self_type & operator=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                const self_type,
-                                                op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      self_type & operator+=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      self_type & operator-=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                          const self_type,
-                                                                          op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      self_type operator+(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      self_type operator-(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      //transposed_matrix_proxy:
-      /** @brief Operator overload for v1 = trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      self_type & operator=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                              const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                              op_trans >,
-                                                     const self_type,
-                                                     op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      self_type & operator+=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                               const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                               op_trans >,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      self_type & operator-=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                               const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                               op_trans >,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      self_type operator+(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                            const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                            op_trans >,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      self_type operator-(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                            const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                            op_trans >,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-                                                                       
-                                                                       
-      //                                                                 
-      //////////// compressed_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator=(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                     const self_type,
-                                                     op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator+=(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator-=(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator+(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator-(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      //
-      // coordinate_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                            const self_type,
-                            op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator+=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator-=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator+(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator-(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      //
-      // ell_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type ell_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator=(const vector_expression< const ell_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                            const self_type,
-                            op_prod> & proxy);
-      
-      //
-      // hyb_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type hyb_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator=(const vector_expression< const hyb_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                            const self_type,
-                            op_prod> & proxy);
-      
-      //
-      // circulant_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a structured matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator=(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                     const self_type,
-                                                     op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a structured matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator+=(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a structured matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator-=(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a structured matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator+(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a structured matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator-(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-
-      //
-      // hankel_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a structured matrix of type hankel_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator=(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                     const self_type,
-                                                     op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a structured matrix of type hankel_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator+=(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a structured matrix of type hankel_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator-=(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a structured matrix of type hankel_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator+(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a structured matrix of type hankel_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator-(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      //
-      // toeplitz_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a structured matrix of type toeplitz_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator=(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                     const self_type,
-                                                     op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a structured matrix of type toeplitz_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator+=(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a structured matrix of type toeplitz_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator-=(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a structured matrix of type toeplitz_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator+(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a structured matrix of type toeplitz_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator-(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      
-      //
-      // vandermonde_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a structured matrix of type vandermonde_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator=(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                     const self_type,
-                                                     op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a structured matrix of type vandermonde_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator+=(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a structured matrix of type vandermonde_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type & operator-=(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                      const self_type,
-                                                      op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a structured matrix of type vandermonde_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator+(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a structured matrix of type vandermonde_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      self_type operator-(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                   const self_type,
-                                                   op_prod> & proxy);
-
-      
-      
-      ///////////////////////////// Matrix Vector interaction end ///////////////////////////////////
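For orientation, here is a minimal usage sketch of the matrix-vector overloads declared above. It is not taken from the header; it assumes viennacl/linalg/prod.hpp is included, that viennacl::linalg::prod() builds the op_prod proxies these operators consume, and that prod() also accepts viennacl::trans(A). The helper name is a placeholder.

#include <viennacl/vector.hpp>
#include <viennacl/matrix.hpp>
#include <viennacl/compressed_matrix.hpp>
#include <viennacl/linalg/prod.hpp>

void matvec_sketch()   // hypothetical helper, for illustration only
{
  viennacl::matrix<float>            A(100, 100);   // dense matrix
  viennacl::compressed_matrix<float> S(100, 100);   // sparse CSR matrix
  viennacl::vector<float>            x(100), y(100);

  y  = viennacl::linalg::prod(A, x);                   // y  = A * x  (dense)
  y += viennacl::linalg::prod(S, x);                   // y += S * x  (compressed_matrix)
  y  = viennacl::linalg::prod(viennacl::trans(A), x);  // y  = trans(A) * x
}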
-
-      //enlarge or reduce allocated memory and set unused memory to zero
-      /** @brief Resizes the allocated memory for the vector. Pads the memory to be a multiple of 'ALIGNMENT'
-      *
-      *  @param new_size  The new size of the vector
-      *  @param preserve  If true, the old entries of the vector are preserved; otherwise they are discarded.
-      */
-      void resize(size_type new_size, bool preserve = true)
-      {
-        assert(new_size > 0);
-        
-        if (new_size != size_)
-        {
-          std::size_t new_internal_size = viennacl::tools::roundUpToNextMultiple<std::size_t>(new_size, ALIGNMENT);
-        
-          std::vector<SCALARTYPE> temp(size_);
-          if (preserve && size_ > 0)
-            fast_copy(*this, temp);
-          temp.resize(new_size);  //drop all entries above new_size
-          temp.resize(new_internal_size); //enlarge to fit new internal size
-          
-          if (new_internal_size != internal_size())
-          {
-            elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*new_internal_size);
-          }
-          
-          fast_copy(temp, *this);
-          size_ = new_size;
-        }
-        
-      }
-      
-
-      //read-write access to an element of the vector
-      /** @brief Read-write access to a single element of the vector
-      */
-      entry_proxy<SCALARTYPE> operator()(size_type index)
-      {
-        return entry_proxy<SCALARTYPE>(index, elements_);
-      }
-
-      /** @brief Read-write access to a single element of the vector
-      */
-      entry_proxy<SCALARTYPE> operator[](size_type index)
-      {
-        return entry_proxy<SCALARTYPE>(index, elements_);
-      }
-
-
-      /** @brief Read access to a single element of the vector
-      */
-      scalar<SCALARTYPE> operator()(size_type index) const
-      {
-        scalar<SCALARTYPE> tmp;
-        cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), elements_, tmp.handle().get(), sizeof(SCALARTYPE)*index, 0, sizeof(SCALARTYPE), 0, NULL, NULL);
-        //assert(err == CL_SUCCESS);
-        VIENNACL_ERR_CHECK(err);
-        return tmp;
-      }
-      
-      /** @brief Read access to a single element of the vector
-      */
-      scalar<SCALARTYPE> operator[](size_type index) const
-      {
-        return operator()(index);
-      }
-      
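A brief sketch of resize() and per-element access as declared above (illustrative only; each element access triggers a separate host-device transfer, so it should stay out of hot loops). The helper name is a placeholder.

#include <viennacl/vector.hpp>

void access_sketch()   // hypothetical helper, for illustration only
{
  viennacl::vector<float> v(5);
  v.resize(8);            // enlarge; existing entries are preserved (preserve = true)
  v.resize(3, false);     // shrink and discard the old contents
  v[0] = 1.0f;            // write access through entry_proxy<float>
  float x = v[0];         // read access (implicit conversion to float; one transfer per element)
  (void)x;
}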
-      /** @brief Inplace addition of a vector
-      */
-      self_type & operator += (const self_type & vec)
-      {
-        viennacl::linalg::inplace_add(*this, vec);
-        return *this;
-      }
-
-      /** @brief Inplace addition of a scaled vector, i.e. v1 += alpha * v2, where alpha is a GPU scalar
-      */
-      self_type & operator += (const vector_expression< const self_type,
-                                                        const scalar<SCALARTYPE>,
-                                                        op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      /** @brief Inplace addition of a scaled vector, i.e. v1 += alpha * v2, where alpha is a CPU scalar
-      */
-      self_type & operator += (const vector_expression< const self_type,
-                                                        const SCALARTYPE,
-                                                        op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      /** @brief Inplace addition of a vector divided by a scalar, i.e. v1 += v2 / alpha, where alpha is a GPU scalar
-      */
-      self_type & operator += (const vector_expression< const self_type,
-                                                        const scalar<SCALARTYPE>,
-                                                        op_div> & proxy)
-      {
-        viennacl::linalg::inplace_div_add(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-
-
-      /** @brief Inplace subtraction of a vector
-      */
-      self_type & operator -= (const self_type & vec)
-      {
-        viennacl::linalg::inplace_sub(*this, vec);
-        return *this;
-      }
-
-      /** @brief Inplace subtraction of a scaled vector, i.e. v1 -= alpha * v2, where alpha is a GPU scalar
-      */
-      self_type & operator -= (const vector_expression< const self_type,
-                                                        const scalar<SCALARTYPE>,
-                                                        op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_sub(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      /** @brief Inplace subtraction of a scaled vector, i.e. v1 -= alpha * v2, where alpha is a CPU scalar
-      */
-      self_type & operator -= (const vector_expression< const self_type,
-                                                        const SCALARTYPE,
-                                                        op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), -proxy.rhs());
-        return *this;
-      }
-
-      /** @brief Inplace subtraction of a vector divided by a scalar, i.e. v1 -= v2 / alpha, where alpha is a GPU scalar
-      */
-      self_type & operator -= (const vector_expression< const self_type,
-                                                        const scalar<SCALARTYPE>,
-                                                        op_div> & proxy)
-      {
-        viennacl::linalg::inplace_div_sub(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-      
-      
-      
-
-      /** @brief Scales this vector by a CPU scalar value
-      */
-      self_type & operator *= (SCALARTYPE val)
-      {
-        viennacl::linalg::inplace_mult(*this, val);
-        return *this;
-      }
-
-      /** @brief Scales this vector by a GPU scalar value
-      */
-      self_type & operator *= (scalar<SCALARTYPE> const & gpu_val)
-      {
-        viennacl::linalg::inplace_mult(*this, gpu_val);
-        return *this;
-      }
-
-      /** @brief Divides this vector by a CPU scalar value
-      */
-      self_type & operator /= (SCALARTYPE val)
-      {
-        viennacl::linalg::inplace_mult(*this, static_cast<SCALARTYPE>(1) / val);
-        return *this;
-      }
-      
-      /** @brief Divides this vector by a GPU scalar value
-      */
-      self_type & operator /= (scalar<SCALARTYPE> const & gpu_val)
-      {
-        viennacl::linalg::inplace_divide(*this, gpu_val);
-        return *this;
-      }
-      
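The compound-assignment operators above can be exercised as in the following sketch (illustrative; the scaling factor may be a host value or a viennacl::scalar living on the device). The helper name is a placeholder.

#include <viennacl/vector.hpp>
#include <viennacl/scalar.hpp>

void inplace_sketch()   // hypothetical helper, for illustration only
{
  viennacl::vector<float> v1(100), v2(100);
  viennacl::scalar<float> alpha(2.0f);      // GPU scalar

  v1 += v2;            // plain in-place addition
  v1 += alpha * v2;    // v1 += alpha * v2 with a GPU scalar
  v1 -= 0.5f * v2;     // v1 -= alpha * v2 with a CPU scalar
  v1 *= 2.0f;          // scale by a CPU scalar
  v1 /= alpha;         // divide by a GPU scalar
}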
-      
-      
-      // free addition
-      
-      /** @brief Adds up two vectors
-      */
-      vector_expression< const self_type, const self_type, op_add>
-      operator + (const self_type & vec) const
-      {
-        return vector_expression< const self_type, 
-                                  const self_type,
-                                  op_add>(*this, vec);
-      }
-      
-      /** @brief Adds up two vectors, i.e. result = v1 + v2 * alpha, where alpha is a GPU scalar
-      */
-      self_type operator + (const vector_expression< const self_type,
-                                                     const scalar<SCALARTYPE>,
-                                                     op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        viennacl::linalg::mul_add(proxy.lhs(), proxy.rhs(), *this, result);
-        return result;
-      }
-
-      /** @brief Adds up two vectors, i.e. result = v1 + v2 * alpha, where alpha is a CPU scalar
-      */
-      self_type operator + (const vector_expression< const self_type,
-                                                     const SCALARTYPE,
-                                                     op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        viennacl::linalg::mul_add(proxy.lhs(), proxy.rhs(), *this, result);
-        return result;
-      }
-
-
-      //
-      // free subtraction:
-      //
-      /** @brief Implementation of    result = v1 - v2
-      */
-      vector_expression< const self_type, const self_type, op_sub>
-      operator - (const self_type & vec) const
-      {
-        return vector_expression< const self_type, 
-                                  const self_type,
-                                  op_sub>(*this, vec);
-      }
-
-
-      /** @brief Subtracts a scaled vector, i.e. result = v1 - v2 * alpha, where alpha is a GPU scalar
-      */
-      self_type operator - (const vector_expression< const self_type,
-                                                     const scalar<SCALARTYPE>,
-                                                     op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        result = *this;
-        viennacl::linalg::inplace_mul_sub(result, proxy.lhs(), proxy.rhs());
-        return result;
-      }
-
-      /** @brief Subtracts a scaled vector, i.e. result = v1 - v2 * alpha, where alpha is a CPU scalar
-      */
-      self_type operator - (const vector_expression< const self_type,
-                                                     const SCALARTYPE,
-                                                     op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        result = *this;
-        viennacl::linalg::inplace_mul_add(result, proxy.lhs(), -proxy.rhs());
-        return result;
-      }
-
-      
-      //free multiplication
-      /** @brief Scales the vector by a CPU scalar 'alpha' and returns an expression template
-      */
-      vector_expression< const self_type, const SCALARTYPE, op_prod> 
-      operator * (SCALARTYPE value) const
-      {
-        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const SCALARTYPE, op_prod>(*this, value);
-      }
-
-      /** @brief Scales the vector by a GPU scalar 'alpha' and returns an expression template
-      */
-      vector_expression< const self_type, const scalar<SCALARTYPE>, op_prod> 
-      operator * (scalar<SCALARTYPE> const & value) const
-      {
-        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const scalar<SCALARTYPE>, op_prod>(*this, value);
-      }
-
-      //free division
-      /** @brief Divides the vector by a CPU scalar 'alpha' and returns an expression template
-      */
-      vector_expression< const self_type, const SCALARTYPE, op_div> 
-      operator / (SCALARTYPE value) const
-      {
-        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const SCALARTYPE, op_div>(*this, value);
-      }
-
-      /** @brief Divides the vector by a GPU scalar 'alpha' and returns an expression template
-      */
-      vector_expression< const self_type, const scalar<SCALARTYPE>, op_div> 
-      operator / (scalar<SCALARTYPE> const & value) const
-      {
-        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const scalar<SCALARTYPE>, op_div>(*this, value);
-      }
-      
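Taken together, the free operators above allow whole expressions to be written on the right-hand side, as in this sketch (illustrative; each assignment evaluates the expression template into the target vector). The helper name is a placeholder.

#include <viennacl/vector.hpp>

void expression_sketch()   // hypothetical helper, for illustration only
{
  viennacl::vector<float> v1(100), v2(100), v3(100);

  v3 = v1 + v2;           // op_add expression, evaluated on assignment
  v3 = v1 - v2 * 2.0f;    // v1 minus a scaled vector (CPU scalar)
  v3 = v1 * 3.0f;         // scaling yields an expression template
  v3 = v1 / 2.0f;         // division by a CPU scalar
}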
-      
-      //// iterators:
-      /** @brief Returns an iterator pointing to the beginning of the vector  (STL like)*/
-      iterator begin()
-      {
-        return iterator(*this, 0);
-      }
-
-      /** @brief Returns an iterator pointing to the end of the vector (STL like)*/
-      iterator end()
-      {
-        return iterator(*this, size());
-      }
-
-      /** @brief Returns a const-iterator pointing to the beginning of the vector (STL like)*/
-      const_iterator begin() const
-      {
-        return const_iterator(*this, 0);
-      }
-
-      /** @brief Returns a const-iterator pointing to the end of the vector (STL like)*/
-      const_iterator end() const
-      {
-        return const_iterator(*this, size());
-      }
-
-      /** @brief Swaps the entries of the two vectors
-      */
-      self_type & swap(self_type & other)
-      {
-        swap(*this, other);
-        return *this;
-      };
-      
-      /** @brief Swaps the handles of two vectors by swapping the OpenCL handles only, no data copy
-      */ 
-      self_type & fast_swap(self_type & other) 
-      { 
-        assert(this->size_ == other.size_); 
-        this->elements_.swap(other.elements_); 
-        return *this; 
-      };       
-      
-      /** @brief Returns the length of the vector (cf. std::vector)
-      */
-      size_type size() const { return size_; }
-      
-      /** @brief Returns the maximum possible size of the vector, which is given by 128 MByte due to limitations of OpenCL.
-      */
-      size_type max_size() const
-      {
-        return (128*1024*1024) / sizeof(SCALARTYPE);  //128 MB is maximum size of memory chunks in OpenCL!
-      }
-      /** @brief Returns the internal length of the vector, which is given by size() plus the extra memory due to padding the memory with zeros up to a multiple of 'ALIGNMENT'
-      */
-      size_type internal_size() const { return viennacl::tools::roundUpToNextMultiple<size_type>(size_, ALIGNMENT); }
-      
-      /** @brief Returns true if the size is zero */
-      bool empty() { return size_ == 0; }
-      
-      /** @brief Returns the OpenCL memory viennacl::ocl::handle. Typically used for launching compute viennacl::ocl::kernels */
-      const viennacl::ocl::handle<cl_mem> & handle() const { return elements_; }
-
-      /** @brief Resets all entries to zero. Does not change the size of the vector.
-      */
-      void clear()
-      {
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "clear");
-        
-        viennacl::ocl::enqueue(k(elements_,
-                                 cl_uint(0),
-                                 cl_uint(1),  //increment
-                                 cl_uint(internal_size()))
-                              );
-      }
-      //void swap(vector & other){}
-      
-
-      //TODO: Think about implementing the following public member functions
-      //void insert_element(unsigned int i, SCALARTYPE val){}
-      //void erase_element(unsigned int i){}
-      
-    private:
-      cl_uint size_;
-      viennacl::ocl::handle<cl_mem> elements_;
-    }; //vector
-    
-
-    //
-    //////////////////// Copy from GPU to CPU //////////////////////////////////
-    //
-    
-    /** @brief STL-like transfer for the entries of a GPU vector to the CPU. The cpu type does not need to lie in a linear piece of memory.
-    *
-    * @param gpu_begin  GPU constant iterator pointing to the beginning of the gpu vector (STL-like)
-    * @param gpu_end    GPU constant iterator pointing to the end of the vector (STL-like)
-    * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
-              const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
-              CPU_ITERATOR cpu_begin )
-    {
-      assert(gpu_end - gpu_begin >= 0);
-      if (gpu_end - gpu_begin != 0)
-      {
-        std::vector<SCALARTYPE> temp_buffer(gpu_end - gpu_begin);
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_begin.handle().get(), CL_TRUE, 0, 
-                                         sizeof(SCALARTYPE)*(gpu_end - gpu_begin),
-                                         &(temp_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        viennacl::ocl::get_queue().finish();
-        
-        //now copy entries to cpu_vec:
-        std::copy(temp_buffer.begin(), temp_buffer.end(), cpu_begin);
-      }
-    }
-
-    /** @brief STL-like transfer for the entries of a GPU vector to the CPU. The cpu type does not need to lie in a linear piece of memory.
-    *
-    * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
-    * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
-    * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void copy(const vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
-              const vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
-              CPU_ITERATOR cpu_begin )
-
-    {
-      copy(const_vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_begin),
-           const_vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_end),
-           cpu_begin);
-    }
-    
-    /** @brief Transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
-    *
-    * @param gpu_vec    A gpu vector
-    * @param cpu_vec    The cpu vector. Type requirements: Output iterator can be obtained via member function .begin()
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
-    void copy(vector<SCALARTYPE, ALIGNMENT> const & gpu_vec,
-              CPUVECTOR & cpu_vec )
-    {
-      viennacl::copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
-    }
-
-    //from gpu to cpu. Type assumption: cpu_vec lies in a linear memory chunk
-    /** @brief STL-like transfer of a GPU vector to the CPU. The cpu type is assumed to reside in a linear piece of memory, such as std::vector.
-    *
-    * This method is faster than the plain copy() function, because entries are
-    * written directly to the cpu vector, starting at &(*cpu.begin()). However,
-    * keep in mind that the cpu type MUST represent a linear piece of
-    * memory; otherwise you will run into undefined behavior.
-    *
-    * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
-    * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
-    * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void fast_copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
-                   const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
-                   CPU_ITERATOR cpu_begin )
-    {
-      if (gpu_begin != gpu_end)
-      {
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_begin.handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_begin.offset(),
-                                         sizeof(SCALARTYPE)*(gpu_end - gpu_begin),
-                                         &(*cpu_begin), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        viennacl::ocl::get_queue().finish();
-      }
-    }
-
-    /** @brief Transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
-    *
-    * @param gpu_vec    A gpu vector.
-    * @param cpu_vec    The cpu vector. Type requirements: Output iterator can be obtained via member function .begin()
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
-    void fast_copy(vector<SCALARTYPE, ALIGNMENT> const & gpu_vec,
-                   CPUVECTOR & cpu_vec )
-    {
-      viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
-    }
-
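A small sketch of the device-to-host transfer routines above (illustrative; fast_copy() requires the host container to be one contiguous block of memory). The helper name is a placeholder.

#include <vector>
#include <viennacl/vector.hpp>

void device_to_host_sketch()   // hypothetical helper, for illustration only
{
  viennacl::vector<float> gpu_vec(100);
  std::vector<float>      cpu_vec(100);

  viennacl::copy(gpu_vec, cpu_vec);                                 // convenience wrapper
  viennacl::copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());  // iterator interface
  viennacl::fast_copy(gpu_vec, cpu_vec);                            // direct write into &cpu_vec[0]
}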
-
-
-    #ifdef VIENNACL_HAVE_EIGEN
-    template <unsigned int ALIGNMENT>
-    void copy(vector<float, ALIGNMENT> const & gpu_vec,
-              Eigen::VectorXf & eigen_vec)
-    {
-      viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), &(eigen_vec[0]));
-    }
-    
-    template <unsigned int ALIGNMENT>
-    void copy(vector<double, ALIGNMENT> & gpu_vec,
-              Eigen::VectorXd & eigen_vec)
-    {
-      viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), &(eigen_vec[0]));
-    }
-    #endif
-
-
-    //
-    //////////////////// Copy from CPU to GPU //////////////////////////////////
-    //
-
-    //from cpu to gpu. Safe assumption: cpu_vector does not necessarily occupy a linear memory segment, but is not larger than the allocated memory on the GPU
-    /** @brief STL-like transfer for the entries of a CPU vector to the GPU. The cpu type does not need to lie in a linear piece of memory.
-    *
-    * @param cpu_begin  CPU iterator pointing to the beginning of the cpu vector (STL-like)
-    * @param cpu_end    CPU iterator pointing to the end of the vector (STL-like)
-    * @param gpu_begin  Output iterator for the gpu vector. The gpu vector must be at least as long as the cpu vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void copy(CPU_ITERATOR const & cpu_begin,
-              CPU_ITERATOR const & cpu_end,
-              vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
-    {
-      assert(cpu_end - cpu_begin >= 0);
-      if (cpu_begin != cpu_end)
-      {
-        //we require that the size of the gpu_vector is larger or equal to the cpu-size
-        std::vector<SCALARTYPE> temp_buffer(cpu_end - cpu_begin);
-        std::copy(cpu_begin, cpu_end, temp_buffer.begin());
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_begin.handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_begin.offset(),
-                                          sizeof(SCALARTYPE)*(cpu_end - cpu_begin),
-                                          &(temp_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-      }
-    }
-
-    // for things like copy(std_vec.begin(), std_vec.end(), vcl_vec.begin() + 1);
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void copy(CPU_ITERATOR const & cpu_begin,
-              CPU_ITERATOR const & cpu_end,
-              const_vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
-    {
-      copy(cpu_begin, cpu_end, vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_begin));
-    }
-
-    /** @brief Transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
-    *
-    * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
-    * @param gpu_vec    The gpu vector.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
-    void copy(const CPUVECTOR & cpu_vec, vector<SCALARTYPE, ALIGNMENT> & gpu_vec)
-    {
-      viennacl::copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
-    }
-
-    /** @brief STL-like transfer of a CPU vector to the GPU. The cpu type is assumed to reside in a linear piece of memory, such as std::vector.
-    *
-    * This method is faster than the plain copy() function, because entries are
-    * directly read from the cpu vector, starting with &(*cpu.begin()). However,
-    * keep in mind that the cpu type MUST represent a linear piece of
-    * memory, otherwise you will run into undefined behavior.
-    *
-    * @param cpu_begin  CPU iterator pointing to the beginning of the cpu vector (STL-like)
-    * @param cpu_end    CPU iterator pointing to the end of the vector (STL-like)
-    * @param gpu_begin  Output iterator for the gpu vector. The gpu iterator must be incrementable (cpu_end - cpu_begin) times, otherwise the result is undefined.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void fast_copy(CPU_ITERATOR const & cpu_begin,
-                   CPU_ITERATOR const & cpu_end,
-                   vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
-    {
-      if (cpu_begin != cpu_end)
-      {
-        //we require that the size of the gpu_vector is larger or equal to the cpu-size
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), 
-                                          gpu_begin.handle().get(), CL_TRUE, sizeof(SCALARTYPE) * gpu_begin.offset(), 
-                                          sizeof(SCALARTYPE)*(cpu_end - cpu_begin), &(*cpu_begin), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-      }
-    }
-
-
-    /** @brief Transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
-    *
-    * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
-    * @param gpu_vec    The gpu vector.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
-    void fast_copy(const CPUVECTOR & cpu_vec, vector<SCALARTYPE, ALIGNMENT> & gpu_vec)
-    {
-      viennacl::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
-    }
-
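Similarly, a sketch for the host-to-device direction (illustrative; the plain copy() also accepts non-contiguous host containers such as std::list, whereas fast_copy() needs contiguous host memory). The helper name is a placeholder.

#include <list>
#include <vector>
#include <viennacl/vector.hpp>

void host_to_device_sketch()   // hypothetical helper, for illustration only
{
  std::vector<float>      cpu_vec(100, 1.0f);
  std::list<float>        cpu_list(100, 2.0f);
  viennacl::vector<float> gpu_vec(100);

  viennacl::copy(cpu_vec, gpu_vec);                                   // convenience wrapper
  viennacl::copy(cpu_list.begin(), cpu_list.end(), gpu_vec.begin());  // non-contiguous host type
  viennacl::fast_copy(cpu_vec, gpu_vec);                              // contiguous host memory required
}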
-    #ifdef VIENNACL_HAVE_EIGEN
-    template <unsigned int ALIGNMENT>
-    void copy(Eigen::VectorXf const & eigen_vec,
-              vector<float, ALIGNMENT> & gpu_vec)
-    {
-      std::vector<float> entries(eigen_vec.size());
-      for (size_t i = 0; i<entries.size(); ++i)
-        entries[i] = eigen_vec(i);
-      viennacl::fast_copy(entries.begin(), entries.end(), gpu_vec.begin());
-    }
-    
-    template <unsigned int ALIGNMENT>
-    void copy(Eigen::VectorXd const & eigen_vec,
-              vector<double, ALIGNMENT> & gpu_vec)
-    {
-      std::vector<double> entries(eigen_vec.size());
-      for (size_t i = 0; i<entries.size(); ++i)
-        entries[i] = eigen_vec(i);
-      viennacl::fast_copy(entries.begin(), entries.end(), gpu_vec.begin());
-    }
-    #endif
-    
-
-
-    //
-    //////////////////// Copy from GPU to GPU //////////////////////////////////
-    //
-    /** @brief Copy (parts of a) GPU vector to another GPU vector
-    *
-    * @param gpu_src_begin    GPU iterator pointing to the beginning of the gpu vector (STL-like)
-    * @param gpu_src_end      GPU iterator pointing to the end of the vector (STL-like)
-    * @param gpu_dest_begin   Output iterator for the gpu vector. The gpu_dest vector must be at least as long as the gpu_src vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
-    void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
-              const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
-              vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin)
-    {
-      assert(gpu_src_end - gpu_src_begin >= 0);
-      
-      if (gpu_src_begin.stride() != 1)
-      {
-        std::cout << "ViennaCL ERROR: copy() for GPU->GPU not implemented for slices! Use operator= instead for the moment." << std::endl;
-        exit(EXIT_FAILURE);
-      }      
-      else if (gpu_src_begin != gpu_src_end)
-      {
-        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_src_begin.handle().get(),  //src handle
-                                          gpu_dest_begin.handle().get(), //dest handle
-                                          sizeof(SCALARTYPE) * gpu_src_begin.offset(), //src offset
-                                          sizeof(SCALARTYPE) * gpu_dest_begin.offset(), //dest offset
-                                          sizeof(SCALARTYPE) * (gpu_src_end.offset() - gpu_src_begin.offset()), //data length
-                                          0, //no events
-                                          NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-      }
-    }
-
-    /** @brief Copy (parts of a) GPU vector to another GPU vector
-    *
-    * @param gpu_src_begin   GPU iterator pointing to the beginning of the gpu vector (STL-like)
-    * @param gpu_src_end     GPU iterator pointing to the end of the vector (STL-like)
-    * @param gpu_dest_begin  Output iterator for the destination gpu vector. The destination gpu vector must be at least as long as the source gpu vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
-    void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
-              const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
-              const_vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin)
-    {
-      copy(gpu_src_begin, gpu_src_end, vector_iterator<SCALARTYPE, ALIGNMENT_DEST>(gpu_dest_begin));
-    }
-
-    /** @brief Transfer from a ViennaCL vector to another ViennaCL vector. Convenience wrapper for viennacl::copy(gpu_src_vec.begin(), gpu_src_vec.end(), gpu_dest_vec.begin());
-    *
-    * @param gpu_src_vec    A gpu vector
-    * @param gpu_dest_vec   The destination gpu vector. Type requirements: Output iterator can be obtained via member function .begin()
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
-    void copy(vector<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_vec,
-              vector<SCALARTYPE, ALIGNMENT_DEST> & gpu_dest_vec )
-    {
-      viennacl::copy(gpu_src_vec.begin(), gpu_src_vec.end(), gpu_dest_vec.begin());
-    } 
-
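And for the device-to-device case (illustrative; as the implementation above notes, strided iterators are not supported here, so the ranges must have stride 1). The helper name is a placeholder.

#include <viennacl/vector.hpp>

void device_to_device_sketch()   // hypothetical helper, for illustration only
{
  viennacl::vector<float> src(100), dst(100);

  viennacl::copy(src, dst);                              // convenience wrapper
  viennacl::copy(src.begin(), src.end(), dst.begin());   // iterator interface, stride 1 only
}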
-
-    
-    
-    
-
-    //global functions for handling vectors:
-    /** @brief Output stream. Output format is ublas compatible.
-    * @param s    STL output stream
-    * @param val  The vector that should be printed
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT>
-    std::ostream & operator<<(std::ostream & s, vector<SCALARTYPE,ALIGNMENT> const & val)
-    {
-      viennacl::ocl::get_queue().finish();
-      std::vector<SCALARTYPE> tmp(val.size());
-      copy(val.begin(), val.end(), tmp.begin());
-      std::cout << "[" << val.size() << "](";
-      for (typename std::vector<SCALARTYPE>::size_type i=0; i<val.size(); ++i)
-      {
-        if (i > 0)
-          s << ",";
-        s << tmp[i];
-      }
-      std::cout << ")";
-      return s;
-    }
-
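The stream operator can be used as in this sketch (illustrative; the output follows the ublas-compatible format [size](v0,v1,...)). The helper name is a placeholder.

#include <iostream>
#include <viennacl/vector.hpp>

void print_sketch()   // hypothetical helper, for illustration only
{
  viennacl::vector<float> v(4);
  std::cout << v << std::endl;   // e.g. [4](0,0,0,0)
}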
-    /** @brief Swaps the contents of two vectors, data is copied
-    *
-    * @param vec1   The first vector
-    * @param vec2   The second vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT>
-    void swap(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
-              viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2)
-    {
-      assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2)
-             && "Incompatible vector sizes in swap()");
-
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "swap");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1),
-                               cl_uint(viennacl::traits::start(vec1)),
-                               cl_uint(viennacl::traits::stride(vec1)),
-                               cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2),
-                               cl_uint(viennacl::traits::start(vec2)),
-                               cl_uint(viennacl::traits::stride(vec2)),
-                               cl_uint(viennacl::traits::size(vec2)))
-                            );
-    }
-    
-    /** @brief Swaps the content of two vectors by swapping OpenCL handles only, NO data is copied
-    *
-    * @param v1   The first vector
-    * @param v2   The second vector
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    vector<SCALARTYPE, ALIGNMENT> & fast_swap(vector<SCALARTYPE, ALIGNMENT> & v1,
-                                              vector<SCALARTYPE, ALIGNMENT> & v2) 
-    { 
-      return v1.fast_swap(v2);
-    }       
-    
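Finally, a sketch contrasting the two swap variants above (illustrative; fast_swap() merely exchanges the underlying OpenCL buffer handles, while swap() launches a kernel that exchanges the entries). The helper name is a placeholder.

#include <viennacl/vector.hpp>

void swap_sketch()   // hypothetical helper, for illustration only
{
  viennacl::vector<float> v1(100), v2(100);

  viennacl::swap(v1, v2);       // element-wise swap via an OpenCL kernel
  viennacl::fast_swap(v1, v2);  // swaps buffer handles only, no data movement
}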
-    
-    
-    ////////// operations /////////////
-    /** @brief Operator overload for the expression alpha * v1, where alpha is a host scalar (float or double) and v1 is a ViennaCL vector.
-    *
-    * @param value   The host scalar (float or double)
-    * @param vec     A ViennaCL vector
-    */
-    template <typename SCALARTYPE, unsigned int A>
-    vector_expression< const vector<SCALARTYPE, A>, const SCALARTYPE, op_prod> operator * (SCALARTYPE const & value, vector<SCALARTYPE, A> const & vec)
-    {
-      return vector_expression< const vector<SCALARTYPE, A>, const SCALARTYPE, op_prod>(vec, value);
-    }
-
-    /** @brief Operator overload for the expression alpha * v1, where alpha is a ViennaCL scalar (float or double) and v1 is a ViennaCL vector.
-    *
-    * @param value   The ViennaCL scalar
-    * @param vec     A ViennaCL vector
-    */
-    template <typename SCALARTYPE, unsigned int A>
-    vector_expression< const vector<SCALARTYPE, A>, const scalar<SCALARTYPE>, op_prod> operator * (scalar<SCALARTYPE> const & value, vector<SCALARTYPE, A> const & vec)
-    {
-        return vector_expression< const vector<SCALARTYPE, A>, const scalar<SCALARTYPE>, op_prod>(vec, value);
-    }
-
-
-    //addition and subtraction of two vector_expressions:
-    /** @brief Operator overload for the addition of two vector expressions.
-    *
-    * @param proxy1  Left hand side vector expression
-    * @param proxy2  Right hand side vector expression
-    */
-    template <typename LHS1, typename RHS1, typename OP1,
-              typename LHS2, typename RHS2, typename OP2>
-    typename vector_expression< LHS1, RHS1, OP1>::VectorType
-    operator + (vector_expression< LHS1, RHS1, OP1> const & proxy1,
-                vector_expression< LHS2, RHS2, OP2> const & proxy2)
-    {
-      assert(proxy1.size() == proxy2.size());
-      typename vector_expression< LHS1, RHS1, OP1>::VectorType result(proxy1.size());
-      result = proxy1;
-      result += proxy2;
-      return result;
-    }
-
-    /** @brief Operator overload for the subtraction of two vector expressions.
-    *
-    * @param proxy1  Left hand side vector expression
-    * @param proxy2  Right hand side vector expression
-    */
-    template <typename LHS1, typename RHS1, typename OP1,
-              typename LHS2, typename RHS2, typename OP2>
-    typename vector_expression< LHS1, RHS1, OP1>::VectorType
-    operator - (vector_expression< LHS1, RHS1, OP1> const & proxy1,
-                vector_expression< LHS2, RHS2, OP2> const & proxy2)
-    {
-      assert(proxy1.size() == proxy2.size());
-      typename vector_expression< LHS1, RHS1, OP1>::VectorType result(proxy1.size());
-      result = proxy1;
-      result -= proxy2;
-      return result;
-    }
-    
-    //////////// one vector expression from left /////////////////////////////////////////
-    
-    /** @brief Operator overload for the addition of a vector expression from the left, e.g. alpha * vec1 + vec2. Here, alpha * vec1 is wrapped into a vector_expression and then added to vec2.
-    *
-    * @param proxy   Left hand side vector expression
-    * @param vec     Right hand side vector
-    */
-    template <typename SCALARTYPE, unsigned int A, typename LHS, typename RHS, typename OP>
-    vector<SCALARTYPE, A> operator + (vector_expression< LHS, RHS, OP> const & proxy,
-                                      vector<SCALARTYPE, A> const & vec)
-    {
-      assert(proxy.size() == vec.size());
-      vector<SCALARTYPE, A> result(vec.size());
-      result = proxy;
-      result += vec;
-      return result;
-    }
-
-    /** @brief Operator overload for the subtraction of a vector from a vector expression, e.g. alpha * vec1 - vec2. Here, alpha * vec1 is wrapped into a vector_expression and vec2 is then subtracted from it.
-    *
-    * @param proxy   Left hand side vector expression
-    * @param vec     Right hand side vector
-    */
-    template <typename SCALARTYPE, unsigned int A, typename LHS, typename RHS, typename OP>
-    vector<SCALARTYPE, A> operator - (vector_expression< LHS, RHS, OP> const & proxy,
-                                      vector<SCALARTYPE, A> const & vec)
-    {
-      assert(proxy.size() == vec.size());
-      vector<SCALARTYPE, A> result(vec.size());
-      result = proxy;
-      result -= vec;
-      return result;
-    }
-
-
-    /** @brief Operator overload for the multiplication of a vector expression with a scalar from the right, e.g. (beta * vec1) * alpha. Here, beta * vec1 is wrapped into a vector_expression and then multiplied with alpha from the right.
-    *
-    * @param proxy   Left hand side vector expression
-    * @param val     Right hand side scalar
-    */
-    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
-    vector<SCALARTYPE> operator * (vector_expression< LHS, RHS, OP> const & proxy,
-                                   scalar<SCALARTYPE> const & val)
-    {
-      vector<SCALARTYPE> result(proxy.size());
-      result = proxy;
-      result *= val;
-      return result;
-    }
-
-    /** @brief Operator overload for the division of a vector expression by a scalar from the right, e.g. (beta * vec1) / alpha. Here, beta * vec1 is wrapped into a vector_expression and then divided by alpha.
-    *
-    * @param proxy   Left hand side vector expression
-    * @param val     Right hand side scalar
-    */
-    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
-    vector<SCALARTYPE> operator / (vector_expression< LHS, RHS, OP> const & proxy,
-                                      scalar<SCALARTYPE> const & val)
-    {
-      vector<SCALARTYPE> result(proxy.size());
-      result = proxy;
-      result /= val;
-      return result;
-    }
-
-
-    //////////// one vector expression from right (on scalar) ///////////////////////
-    
-    /** @brief Operator overload for the multiplication of a vector expression with a ViennaCL scalar from the left, e.g. alpha * (beta * vec1). Here, beta * vec1 is wrapped into a vector_expression and then multiplied with alpha from the left.
-    *
-    * @param val     Left hand side scalar
-    * @param proxy   Right hand side vector expression
-    */
-    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
-    vector<SCALARTYPE> operator * (scalar<SCALARTYPE> const & val,
-                                   vector_expression< LHS, RHS, OP> const & proxy)
-    {
-      vector<SCALARTYPE> result(proxy.size());
-      result = proxy;
-      result *= val;
-      return result;
-    }
-    
-    /** @brief Operator overload for the multiplication of a vector expression with a host scalar (float or double) from the left, e.g. alpha * (beta * vec1). Here, beta * vec1 is wrapped into a vector_expression and then multiplied with alpha from the left.
-    *
-    * @param val     Left hand side scalar
-    * @param proxy   Right hand side vector expression
-    */
-    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
-    viennacl::vector<SCALARTYPE> operator * (SCALARTYPE val,
-                                   viennacl::vector_expression< LHS, RHS, OP> const & proxy)
-    {
-      viennacl::vector<SCALARTYPE> result(proxy.size());
-      result = proxy;
-      result *= val;
-      return result;
-    }
-
-}
-
-#endif
-=======
 #ifndef VIENNACL_VECTOR_HPP_
 #define VIENNACL_VECTOR_HPP_
 
@@ -4963,4 +3238,3 @@ namespace viennacl
 } // namespace viennacl
 
 #endif
->>>>>>> upstream/1.5.1
diff --git a/viennacl/vector_proxy.hpp b/viennacl/vector_proxy.hpp
index 26d4847..a7f2cfa 100644
--- a/viennacl/vector_proxy.hpp
+++ b/viennacl/vector_proxy.hpp
@@ -38,12 +38,8 @@ namespace viennacl
   class vector_range : public vector_base<typename VectorType::cpu_value_type>
   {
       typedef vector_range<VectorType>             self_type;
-<<<<<<< HEAD
-    
-=======
       typedef vector_base<typename VectorType::cpu_value_type> base_type;
 
->>>>>>> upstream/1.5.1
     public:
       typedef typename VectorType::value_type      value_type;
       typedef range::size_type                     size_type;
@@ -52,210 +48,6 @@ namespace viennacl
       typedef const value_type &                   const_reference;
       typedef typename VectorType::const_iterator  const_iterator;
       typedef typename VectorType::iterator        iterator;
-<<<<<<< HEAD
-      
-
-      typedef typename viennacl::result_of::cpu_value_type<value_type>::type    cpu_value_type;
-      
-      static const int alignment = VectorType::alignment;
-      
-      vector_range(VectorType & v, 
-                   range const & entry_range) : v_(v), entry_range_(entry_range) {}
-                   
-      size_type start() const { return entry_range_.start(); }
-      size_type size() const { return entry_range_.size(); }
-
-      
-      /** @brief Operator overload for v1 = A * v2, where v1 and v2 are vector ranges and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename MatrixType>
-      typename viennacl::enable_if< viennacl::is_matrix<MatrixType>::value, self_type &>::type
-      operator=(const vector_expression< const MatrixType,
-                                         const self_type,
-                                         op_prod> & proxy);
-      
-      
-      
-
-      template <typename LHS, typename RHS, typename OP>
-      self_type & operator=(const vector_expression< LHS,
-                                                     RHS,
-                                                     OP > & proxy) 
-      {
-        VectorType temp = proxy;
-        *this = temp;
-        return *this;
-      }      
-
-
-      /** @brief Convenience function, which allows to assign a vector directly to a vector range of suitable size */
-      self_type & operator=(const VectorType & v) 
-      {
-        assert(size() == v.size() && "Vector range and vector size mismatch!");
-        
-        if (size() > 0)
-        {
-          cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
-                                           v.handle().get(),     // src buffer
-                                           v_.handle().get(),    //dest buffer
-                                           0,                    // src offset
-                                           sizeof(cpu_value_type) * start(), //dest offset
-                                           sizeof(cpu_value_type) * size(),  //number of bytes to be copied
-                                           0, NULL, NULL);
-                                           
-          VIENNACL_ERR_CHECK(err);
-        }
-        
-        return *this;
-      }      
-
-      /** @brief Convenience function, which allows to assign a vector range directly to another vector range of suitable size */
-      self_type & operator=(const self_type & v) 
-      {
-        assert(size() == v.size() && "Sizes of vector ranges don't match!");
-        
-        if (size() > 0)
-        {
-          cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
-                                           v.get().handle().get(),   // src buffer
-                                           v_.handle().get(),        //dest buffer
-                                           sizeof(cpu_value_type) * v.start(),   // src offset
-                                           sizeof(cpu_value_type) * start(),     //dest offset
-                                           sizeof(cpu_value_type) * size(),      //number of bytes to be copied
-                                           0, NULL, NULL);
-                                           
-          VIENNACL_ERR_CHECK(err);
-        }
-        
-        return *this;
-      }      
-
-      ///////////// operator +=
-
-      self_type & operator += (VectorType const & other)
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-
-      self_type & operator += (self_type const & other)
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-      
-      ///////////// operator -=
-
-      self_type & operator -= (VectorType const & other)
-      {
-        viennacl::linalg::inplace_sub(*this, other);
-        return *this;
-      }
-
-      self_type & operator -= (self_type const & other)
-      {
-        viennacl::linalg::inplace_sub(*this, other);
-        return *this;
-      }
-
-      ///////////// operator *=
-      self_type & operator *= (cpu_value_type const & cpu_val)
-      {
-        viennacl::linalg::inplace_mult(*this, cpu_val);
-        return *this;
-      }
-      
-      self_type & operator *= (value_type const & gpu_val)
-      {
-        viennacl::linalg::inplace_mult(*this, gpu_val);
-        return *this;
-      }
-
-      ///////////// operator /=
-      self_type & operator /= (cpu_value_type const & cpu_val)
-      {
-        viennacl::linalg::inplace_mult(*this, cpu_value_type(1) / cpu_val);
-        return *this;
-      }
-      
-      self_type & operator /= (value_type const & gpu_val)
-      {
-        viennacl::linalg::inplace_divide(*this, gpu_val);
-        return *this;
-      }
-      
-      
-      ///////////// Direct manipulation via operator() and operator[]
-      //read-write access to an element of the vector
-      /** @brief Read-write access to a single element of the vector
-      */
-      entry_proxy<cpu_value_type> operator()(size_type index)
-      {
-        return entry_proxy<cpu_value_type>(index + start(), v_.get());
-      }
-
-      /** @brief Read-write access to a single element of the vector
-      */
-      entry_proxy<cpu_value_type> operator[](size_type index)
-      {
-        return entry_proxy<cpu_value_type>(index + start(), v_.get());
-      }
-
-
-      /** @brief Read access to a single element of the vector
-      */
-      scalar<cpu_value_type> operator()(size_type index) const
-      {
-        scalar<cpu_value_type> tmp;
-        cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), v_.get(), tmp.handle(), sizeof(cpu_value_type)*(index + start()), 0, sizeof(cpu_value_type), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        return tmp;
-      }
-      
-      /** @brief Read access to a single element of the vector
-      */
-      scalar<cpu_value_type> operator[](size_type index) const
-      {
-        return operator()(index);
-      }
-      
-      ///////////// iterators:
-      /** @brief Returns an iterator pointing to the beginning of the vector  (STL like)*/
-      iterator begin()
-      {
-        return iterator(v_, 0, start());
-      }
-
-      /** @brief Returns an iterator pointing to the end of the vector (STL like)*/
-      iterator end()
-      {
-        return iterator(v_, size(), start());
-      }
-
-      /** @brief Returns a const-iterator pointing to the beginning of the vector (STL like)*/
-      const_iterator begin() const
-      {
-        return const_iterator(v_, start());
-      }
-
-      /** @brief Returns a const-iterator pointing to the end of the vector (STL like)*/
-      const_iterator end() const
-      {
-        return const_iterator(v_, size(), start());
-      }
-      
-      ///////////// Misc
-
-      VectorType & get() { return v_; }
-      const VectorType & get() const { return v_; }
-
-    private:
-      VectorType & v_;
-      range entry_range_;
-=======
 
       typedef typename VectorType::cpu_value_type    cpu_value_type;
 
@@ -267,47 +59,14 @@ namespace viennacl
 
       using base_type::operator=;
 
->>>>>>> upstream/1.5.1
   };
-  
-  
-  //implement operator= for vector:
-  
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const vector_range< viennacl::vector<SCALARTYPE, ALIGNMENT> > & r) 
-  {
-    assert(this->size() == r.size() && "Vector size mismatch!");
-    
-    if (this->size() > 0)
-    {
-      cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
-                                        r.get().handle().get(),      // src buffer
-                                        this->handle().get(),        //dest buffer
-                                        sizeof(SCALARTYPE) * r.start(),       // src offset
-                                        0,                                    //dest offset
-                                        sizeof(SCALARTYPE) * r.size(), //number of bytes to be copied
-                                        0, NULL, NULL);
-                                        
-      VIENNACL_ERR_CHECK(err);
-    }
-    
-    return *this;
-  }
-  
-  
-  
 
 
 
   /////////////////////////////////////////////////////////////
   ///////////////////////// CPU to GPU ////////////////////////
   /////////////////////////////////////////////////////////////
-<<<<<<< HEAD
-  
-=======
 
->>>>>>> upstream/1.5.1
   template <typename VectorType, typename SCALARTYPE>
   void copy(const VectorType & cpu_vector,
             vector_range<vector<SCALARTYPE> > & gpu_vector_range )
@@ -339,10 +98,7 @@ namespace viennacl
   ///////////////////////// GPU to CPU ////////////////////////
   /////////////////////////////////////////////////////////////
 
-<<<<<<< HEAD
-=======
 
->>>>>>> upstream/1.5.1
   template <typename SCALARTYPE, typename VectorType>
   void copy(vector_range<vector<SCALARTYPE> > const & gpu_vector_range,
             VectorType & cpu_vector)
@@ -383,15 +139,12 @@ namespace viennacl
     return vector_range<VectorType>(vec, r1);
   }
 
-<<<<<<< HEAD
-=======
   template <typename VectorType>
   vector_range<VectorType> project(viennacl::vector_range<VectorType> & vec, viennacl::range const & r1)
   {
     assert(r1.size() <= vec.size() && bool("Size of range invalid!"));
     return vector_range<VectorType>(vec, viennacl::range(vec.start() + r1.start(), vec.start() + r1.start() + r1.size()));
   }
->>>>>>> upstream/1.5.1
 
 //
 //
@@ -403,14 +156,6 @@ namespace viennacl
 
 
 
-<<<<<<< HEAD
-
-  template <typename VectorType>
-  class vector_slice
-  {
-      typedef vector_slice<VectorType>             self_type;
-    
-=======
   /** @brief Class for representing strided subvectors of a bigger vector x.
     *
     * In MATLAB notation, this could for example refer to the subvector x(3:2:8) of a vector x.
@@ -421,7 +166,6 @@ namespace viennacl
       typedef vector_slice<VectorType>             self_type;
       typedef vector_base<typename VectorType::cpu_value_type> base_type;
 
->>>>>>> upstream/1.5.1
     public:
       typedef typename VectorType::value_type      value_type;
       typedef slice::size_type                     size_type;
@@ -430,233 +174,6 @@ namespace viennacl
       typedef const value_type &                   const_reference;
       typedef typename VectorType::const_iterator  const_iterator;
       typedef typename VectorType::iterator        iterator;
-<<<<<<< HEAD
-      
-
-      typedef typename viennacl::result_of::cpu_value_type<value_type>::type    cpu_value_type;
-      
-      static const int alignment = VectorType::alignment;
-      
-      vector_slice(VectorType & v, 
-                   slice const & entry_slice) : v_(v), entry_slice_(entry_slice) {}
-                   
-      size_type start() const { return entry_slice_.start(); }
-      size_type stride() const { return entry_slice_.stride(); }
-      size_type size() const { return entry_slice_.size(); }
-
-      
-      /** @brief Operator overload for v1 = A * v2, where v1 and v2 are vector slices and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename MatrixType>
-      typename viennacl::enable_if< viennacl::is_matrix<MatrixType>::value, self_type &>::type
-      operator=(const vector_expression< const MatrixType,
-                                         const self_type,
-                                         op_prod> & proxy);
-      
-      
-      
-
-      template <typename LHS, typename RHS, typename OP>
-      self_type & operator=(const vector_expression< LHS,
-                                                     RHS,
-                                                     OP > & proxy) 
-      {
-        VectorType temp = proxy;
-        *this = temp;
-        return *this;
-      }      
-
-
-      /** @brief Convenience function, which allows to assign a vector directly to a vector slice of suitable size */
-      self_type & operator=(const VectorType & v) 
-      {
-        assert(size() == v.size() && "Vector slice and vector size mismatch!");
-        
-        if (size() > 0)
-          viennacl::linalg::assign(*this, v);
-        
-        return *this;
-      }      
-
-      /** @brief Convenience function, which allows to assign a vector slice directly to another vector slice of suitable size */
-      self_type & operator=(const self_type & v) 
-      {
-        assert(size() == v.size() && "Sizes of vector slices don't match!");
-        
-        if (size() > 0)
-          viennacl::linalg::assign(*this, v);
-        
-        return *this;
-      }      
-
-      ///////////// operator +=
-
-      self_type & operator += (VectorType const & other)
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-
-      self_type & operator += (self_type const & other)
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-      
-      ///////////// operator -=
-
-      self_type & operator -= (VectorType const & other)
-      {
-        viennacl::linalg::inplace_sub(*this, other);
-        return *this;
-      }
-
-      self_type & operator -= (self_type const & other)
-      {
-        viennacl::linalg::inplace_sub(*this, other);
-        return *this;
-      }
-
-      ///////////// operator *=
-      self_type & operator *= (cpu_value_type const & cpu_val)
-      {
-        viennacl::linalg::inplace_mult(*this, cpu_val);
-        return *this;
-      }
-      
-      self_type & operator *= (value_type const & gpu_val)
-      {
-        viennacl::linalg::inplace_mult(*this, gpu_val);
-        return *this;
-      }
-
-      ///////////// operator /=
-      self_type & operator /= (cpu_value_type const & cpu_val)
-      {
-        viennacl::linalg::inplace_mult(*this, cpu_value_type(1) / cpu_val);
-        return *this;
-      }
-      
-      self_type & operator /= (value_type const & gpu_val)
-      {
-        viennacl::linalg::inplace_divide(*this, gpu_val);
-        return *this;
-      }
-      
-      
-      ///////////// Direct manipulation via operator() and operator[]
-      //read-write access to an element of the vector
-      /** @brief Read-write access to a single element of the vector
-      */
-      entry_proxy<cpu_value_type> operator()(size_type index)
-      {
-        return entry_proxy<cpu_value_type>(index * stride() + start(), v_.get());
-      }
-
-      /** @brief Read-write access to a single element of the vector
-      */
-      entry_proxy<cpu_value_type> operator[](size_type index)
-      {
-        return entry_proxy<cpu_value_type>(index * stride() + start(), v_.get());
-      }
-
-
-      /** @brief Read access to a single element of the vector
-      */
-      scalar<cpu_value_type> operator()(size_type index) const
-      {
-        scalar<cpu_value_type> tmp;
-        cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), v_.get(), tmp.handle(), sizeof(cpu_value_type)*(index * stride() + start()), 0, sizeof(cpu_value_type), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        return tmp;
-      }
-      
-      /** @brief Read access to a single element of the vector
-      */
-      scalar<cpu_value_type> operator[](size_type index) const
-      {
-        return operator()(index);
-      }
-      
-      ///////////// iterators:
-      /** @brief Returns an iterator pointing to the beginning of the vector  (STL like)*/
-      iterator begin()
-      {
-        return iterator(v_, 0, start(), stride());
-      }
-
-      /** @brief Returns an iterator pointing to the end of the vector (STL like)*/
-      iterator end()
-      {
-        return iterator(v_, size(), start(), stride());
-      }
-
-      /** @brief Returns a const-iterator pointing to the beginning of the vector (STL like)*/
-      const_iterator begin() const
-      {
-        return const_iterator(v_, 0, start(), stride());
-      }
-
-      /** @brief Returns a const-iterator pointing to the end of the vector (STL like)*/
-      const_iterator end() const
-      {
-        return const_iterator(v_, size(), start(), stride());
-      }
-      
-      ///////////// Misc
-
-      VectorType & get() { return v_; }
-      const VectorType & get() const { return v_; }
-
-    private:
-      VectorType & v_;
-      slice entry_slice_;
-  };
-  
-  
-  //implement operator= for vector:
-  
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const vector_slice< viennacl::vector<SCALARTYPE, ALIGNMENT> > & r) 
-  {
-    assert(this->size() == r.size() && "Vector size mismatch!");
-    
-    if (this->size() > 0)
-      viennacl::linalg::assign(*this, r);
-    
-    return *this;
-  }
-  
-  
-  
-
-  
-  template<typename VectorType>
-  std::ostream & operator<<(std::ostream & s, vector_slice<VectorType> const & proxy)
-  {
-    typedef typename VectorType::value_type   ScalarType;
-    std::vector<ScalarType> temp(proxy.size());
-    viennacl::copy(proxy, temp);
-    
-    //instead of printing 'temp' directly, let's reuse the existing functionality for viennacl::vector. It certainly adds overhead, but printing a vector is typically not about performance...
-    VectorType temp2(temp.size());
-    viennacl::copy(temp, temp2);
-    s << temp2;
-    return s;
-  }
-  
-  
-  
-  
-  /////////////////////////////////////////////////////////////
-  ///////////////////////// CPU to GPU ////////////////////////
-  /////////////////////////////////////////////////////////////
-  
-=======
 
       typedef typename VectorType::cpu_value_type  cpu_value_type;
 
@@ -675,39 +192,10 @@ namespace viennacl
   ///////////////////////// CPU to GPU ////////////////////////
   /////////////////////////////////////////////////////////////
 
->>>>>>> upstream/1.5.1
   template <typename VectorType, typename SCALARTYPE>
   void copy(const VectorType & cpu_vector,
             vector_slice<vector<SCALARTYPE> > & gpu_vector_slice )
   {
-<<<<<<< HEAD
-    assert(cpu_vector.end() - cpu_vector.begin() >= 0);
-    
-    if (cpu_vector.end() - cpu_vector.begin() > 0)
-    {
-      
-      // OpenCL 1.0 version: (no use of clEnqueueWriteBufferRect())
-      std::vector<SCALARTYPE> temp_buffer(gpu_vector_slice.stride() * gpu_vector_slice.size());
-      
-      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                        gpu_vector_slice.get().handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_slice.start(), 
-                                        sizeof(SCALARTYPE)*temp_buffer.size(),
-                                        &(temp_buffer[0]), 0, NULL, NULL);
-      
-      VIENNACL_ERR_CHECK(err);
-
-      for (std::size_t i=0; i<cpu_vector.size(); ++i)
-      {
-        temp_buffer[i * gpu_vector_slice.stride()] = cpu_vector[i];
-      }
-      
-      err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                 gpu_vector_slice.get().handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_slice.start(),
-                                 sizeof(SCALARTYPE)*temp_buffer.size(),
-                                 &(temp_buffer[0]), 0, NULL, NULL);
-      
-      VIENNACL_ERR_CHECK(err);
-=======
     if (cpu_vector.size() > 0)
     {
       std::vector<SCALARTYPE> temp_buffer(gpu_vector_slice.stride() * gpu_vector_slice.size());
@@ -718,7 +206,6 @@ namespace viennacl
         temp_buffer[i * gpu_vector_slice.stride()] = cpu_vector[i];
 
       viennacl::backend::memory_write(gpu_vector_slice.handle(), sizeof(SCALARTYPE)*gpu_vector_slice.start(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
->>>>>>> upstream/1.5.1
     }
   }
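
(Not part of the patch.) Both versions of the strided copy above stage the slice through a contiguous temporary so that a single bulk transfer suffices; only the transfer call differs (raw clEnqueueReadBuffer/clEnqueueWriteBuffer on the HEAD side, viennacl::backend::memory_write on the upstream side). A host-only sketch of the gather/scatter step, with illustrative names not taken from the sources:

    #include <cstddef>
    #include <vector>

    // Scatter a dense host vector into a staging buffer laid out with the
    // stride of the device slice; the buffer is then written back to the
    // device in a single contiguous transfer.
    void scatter_into_staging(std::vector<float> & staging,           // stride * size elements
                              std::vector<float> const & cpu_vector,  // size elements
                              std::size_t stride)
    {
      for (std::size_t i = 0; i < cpu_vector.size(); ++i)
        staging[i * stride] = cpu_vector[i];
    }
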
 
@@ -727,36 +214,12 @@ namespace viennacl
   /////////////////////////////////////////////////////////////
   ///////////////////////// GPU to CPU ////////////////////////
   /////////////////////////////////////////////////////////////
-<<<<<<< HEAD
-  
-=======
 
->>>>>>> upstream/1.5.1
 
   template <typename VectorType, typename SCALARTYPE>
   void copy(vector_slice<vector<SCALARTYPE> > const & gpu_vector_slice,
             VectorType & cpu_vector)
   {
-<<<<<<< HEAD
-    assert(cpu_vector.end() - cpu_vector.begin() >= 0);
-    
-    if (cpu_vector.end() > cpu_vector.begin())
-    {
-      // OpenCL 1.0 version: (no use of clEnqueueWriteBufferRect())
-      std::vector<SCALARTYPE> temp_buffer(gpu_vector_slice.stride() * gpu_vector_slice.size());
-      
-      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                        gpu_vector_slice.get().handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_slice.start(), 
-                                        sizeof(SCALARTYPE)*temp_buffer.size(),
-                                        &(temp_buffer[0]), 0, NULL, NULL);
-      
-      VIENNACL_ERR_CHECK(err);
-
-      for (std::size_t i=0; i<cpu_vector.size(); ++i)
-      {
-        cpu_vector[i] = temp_buffer[i * gpu_vector_slice.stride()];
-      }
-=======
     assert(gpu_vector_slice.end() - gpu_vector_slice.begin() >= 0 && bool("Range must have nonnegative length!"));
 
     if (gpu_vector_slice.end() - gpu_vector_slice.begin() > 0)
@@ -766,7 +229,6 @@ namespace viennacl
 
       for (vcl_size_t i=0; i<cpu_vector.size(); ++i)
         cpu_vector[i] = temp_buffer[i * gpu_vector_slice.stride()];
->>>>>>> upstream/1.5.1
     }
   }
 
@@ -775,20 +237,11 @@ namespace viennacl
 
 
   //
-<<<<<<< HEAD
-  // Convenience function
-=======
   // Convenience functions
->>>>>>> upstream/1.5.1
   //
   template <typename VectorType>
   vector_slice<VectorType> project(VectorType & vec, viennacl::slice const & s1)
   {
-<<<<<<< HEAD
-    return vector_slice<VectorType>(vec, s1);
-  }
-
-=======
     assert(s1.size() <= vec.size() && bool("Size of slice larger than vector size!"));
     return vector_slice<VectorType>(vec, s1);
   }
@@ -817,7 +270,6 @@ namespace viennacl
   }
 
 
->>>>>>> upstream/1.5.1
 }
 
 #endif
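
(Not part of the patch.) The convenience functions retained from upstream -- project() over a range or slice, plus the copy() overloads shown in the hunks above -- cover the use cases the removed HEAD-side proxy classes implemented by hand. A short usage sketch, assuming a ViennaCL 1.5.x build (sizes and indices are illustrative only):

    #include <vector>
    #include "viennacl/vector.hpp"
    #include "viennacl/vector_proxy.hpp"

    // Illustrative sketch only: sub-vector views and host<->device copies
    // using the upstream proxy API kept by this commit.
    void proxy_usage_example()
    {
      viennacl::vector<float> v(10);
      std::vector<float> host4(4, 1.0f);

      // Contiguous sub-vector v[2..5]:
      viennacl::vector_range< viennacl::vector<float> > v_sub(v, viennacl::range(2, 6));
      viennacl::copy(host4, v_sub);                                            // CPU -> GPU

      // Strided sub-vector v[1], v[3], v[5], v[7], read back to the host:
      viennacl::copy(viennacl::project(v, viennacl::slice(1, 2, 4)), host4);   // GPU -> CPU
    }
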

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/viennacl.git


