[hypre] 01/04: New upstream version 2.12.1

Drew Parsons dparsons at moszumanska.debian.org
Wed Oct 18 11:15:23 UTC 2017


This is an automated email from the git hooks/post-receive script.

dparsons pushed a commit to branch experimental
in repository hypre.

commit b0efb9cf78d99b6a7f42a3dc709864a9c838189b
Author: Drew Parsons <dparsons at debian.org>
Date:   Wed Oct 18 18:49:18 2017 +0800

    New upstream version 2.12.1
---
 AUTOTEST/README.txt                                |   27 +-
 AUTOTEST/autotest.sh                               |  283 --
 AUTOTEST/{basictest.sh => basic.sh}                |    0
 AUTOTEST/check-double.filters                      |    9 +-
 AUTOTEST/check-int.filters                         |    8 +
 AUTOTEST/check-mpi.filters                         |   11 +-
 AUTOTEST/{cmaketest.sh => cmake.sh}                |    2 +-
 AUTOTEST/cronfile                                  |   29 -
 AUTOTEST/machine-mac.sh                            |   12 +-
 AUTOTEST/{machine-rzmerl.sh => machine-rztopaz.sh} |   20 +-
 AUTOTEST/{machine-rzzeus.sh => machine-syrah.sh}   |   20 +-
 AUTOTEST/machine-tux.sh                            |  102 +-
 AUTOTEST/machine-vulcan.sh                         |   12 +-
 AUTOTEST/testdist.sh                               |  174 --
 AUTOTEST/testsrc.sh                                |   89 -
 CHANGELOG                                          |   14 +
 docs/HYPRE_ref_manual.pdf                          |  Bin 874193 -> 874961 bytes
 docs/HYPRE_usr_manual.pdf                          |  Bin 653348 -> 653348 bytes
 src/CMakeLists.txt                                 |   16 +-
 src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx             |    6 +-
 src/FEI_mv/fei-hypre/Makefile                      |    4 +-
 src/IJ_mv/IJVector_parcsr.c                        |    5 +-
 src/Makefile                                       |    1 +
 src/blas/hypre_blas.h                              |    4 +-
 src/config/HYPRE_config.h.cmake.in                 |    6 +
 src/config/HYPRE_config.h.in                       |    9 +
 src/config/Makefile.config.in                      |   34 +-
 src/config/config.guess                            |   20 +-
 src/config/configure.in                            |  428 ++-
 src/configure                                      |  787 +++++-
 src/distributed_ls/Euclid/Euclid_dh.c              |   17 +-
 src/distributed_ls/Euclid/ExternalRows_dh.c        |    4 +-
 src/distributed_ls/Euclid/Factor_dh.c              |   10 +-
 src/distributed_ls/Euclid/Mat_dh.c                 |    8 +-
 src/distributed_ls/Euclid/TimeLog_dh.c             |    4 +-
 src/distributed_ls/Euclid/_hypre_Euclid.h          |   12 +-
 src/distributed_ls/Euclid/blas_dh.c                |    4 +-
 src/distributed_ls/Euclid/globalObjects.c          |    3 +
 src/distributed_ls/Euclid/macros_dh.h              |    4 +
 src/distributed_ls/Euclid/mat_dh_private.c         |    8 +-
 src/distributed_ls/ParaSails/ConjGrad.c            |    2 +-
 src/distributed_ls/ParaSails/DiagScale.c           |    4 +-
 src/distributed_ls/ParaSails/FGmres.c              |    2 +-
 src/distributed_ls/ParaSails/LoadBal.c             |    8 +-
 src/distributed_ls/ParaSails/Matrix.c              |   12 +-
 src/distributed_ls/ParaSails/ParaSails.c           |   18 +-
 src/distributed_ls/ParaSails/driver.c              |   12 +-
 src/distributed_ls/ParaSails/lapdriver.c           |    4 +-
 src/distributed_ls/pilut/comm.c                    |    6 +-
 src/distributed_ls/pilut/parilut.c                 |    4 +-
 src/distributed_ls/pilut/trifactor.c               |    8 +-
 src/examples/README_files/ex12f.f.html             |    2 +-
 src/examples/ex12f.f                               |    2 +-
 src/krylov/bicgstab.c                              |    3 +-
 src/krylov/gmres.c                                 |    1 -
 src/krylov/lobpcg.c                                |    2 +-
 src/krylov/pcg.c                                   |    4 +-
 src/lapack/dbdsqr.c                                |    2 +-
 src/lapack/dgesvd.c                                |    1 -
 src/lapack/dlabad.c                                |    2 +-
 src/lapack/dlae2.c                                 |    2 -
 src/lapack/dlaev2.c                                |    2 -
 src/lapack/dlanst.c                                |    2 -
 src/lapack/dlartg.c                                |   11 +-
 src/lapack/dlas2.c                                 |    2 -
 src/lapack/dlasq1.c                                |    3 -
 src/lapack/dlasq2.c                                |    3 -
 src/lapack/dlasq3.c                                |    2 -
 src/lapack/dlasq4.c                                |    3 -
 src/lapack/dlasv2.c                                |    2 +-
 src/lapack/dsteqr.c                                |    2 +-
 src/lapack/dsterf.c                                |    2 +-
 src/lapack/dsyev.c                                 |    2 -
 src/lapack/hypre_lapack.h                          |    6 +
 src/multivector/backup.c                           |    4 +-
 src/multivector/csr_matmultivec.c                  |    2 +-
 src/parcsr_ls/F90_HYPRE_parcsr_amg.c               |  106 +-
 src/parcsr_ls/HYPRE_parcsr_amg.c                   |   48 +-
 src/parcsr_ls/HYPRE_parcsr_ls.h                    |   50 +
 src/parcsr_ls/Makefile                             |    2 +
 src/parcsr_ls/_hypre_parcsr_ls.h                   |   44 +-
 src/parcsr_ls/ams.c                                |   42 +-
 src/parcsr_ls/gen_redcs_mat.c                      |   14 +-
 src/parcsr_ls/par_add_cycle.c                      |  226 +-
 src/parcsr_ls/par_amg.c                            |  107 +
 src/parcsr_ls/par_amg.h                            |   18 +-
 src/parcsr_ls/par_amg_setup.c                      |  275 +-
 src/parcsr_ls/par_cheby.c                          |  338 +++
 src/parcsr_ls/par_coarsen.c                        |    3 +-
 src/parcsr_ls/par_cr.c                             |   18 +-
 src/parcsr_ls/par_cycle.c                          |   34 +-
 src/parcsr_ls/par_gsmg.c                           |    2 +-
 src/parcsr_ls/par_jacobi_interp.c                  |    2 +-
 src/parcsr_ls/par_relax.c                          |   26 +-
 src/parcsr_ls/par_relax_more.c                     |   53 +-
 src/parcsr_ls/par_stats.c                          |   84 +-
 src/parcsr_ls/par_strength.c                       |    1 +
 src/parcsr_ls/par_vardifconv.c                     |    9 +-
 .../{par_vardifconv.c => par_vardifconv_rs.c}      |  309 ++-
 src/parcsr_mv/_hypre_parcsr_mv.h                   |    6 +
 src/parcsr_mv/par_csr_communication.c              |    2 +-
 src/parcsr_mv/par_csr_matrix.c                     |    8 +
 src/parcsr_mv/par_csr_matvec.c                     |   47 +-
 src/parcsr_mv/par_vector.c                         |    6 +
 src/seq_mv/Makefile                                |    8 +-
 src/seq_mv/Makefile.empty                          |    1 +
 src/seq_mv/Makefile.nvcc                           |    4 +
 src/seq_mv/csr_matrix.c                            |   45 +-
 src/seq_mv/csr_matvec.c                            |   85 +-
 src/seq_mv/gpukernels.cu                           |  243 ++
 src/seq_mv/gpukernels.h                            |    9 +
 src/seq_mv/headers                                 |    1 +
 src/seq_mv/seq_mv.h                                |   42 +-
 src/seq_mv/vector.c                                |  161 +-
 src/sstruct_ls/HYPRE_sstruct_int.c                 |    8 +-
 src/sstruct_ls/fac_amr_fcoarsen.c                  |   48 +-
 src/sstruct_ls/fac_amr_rap.c                       |    7 +-
 src/sstruct_ls/fac_cf_coarsen.c                    |   36 +-
 src/sstruct_ls/fac_interp2.c                       |   37 +-
 src/sstruct_ls/fac_restrict2.c                     |   25 +-
 src/sstruct_ls/fac_setup2.c                        |   11 +-
 src/sstruct_ls/fac_zero_stencilcoef.c              |   20 +-
 src/sstruct_ls/maxwell_PNedelec.c                  |  350 +--
 src/sstruct_ls/maxwell_TV_setup.c                  |    9 +-
 src/sstruct_ls/maxwell_grad.c                      |   30 +-
 src/sstruct_ls/maxwell_physbdy.c                   |   12 +-
 src/sstruct_ls/maxwell_semi_interp.c               |  435 +--
 src/sstruct_ls/node_relax.c                        |   36 +-
 src/sstruct_ls/sstruct_sharedDOFComm.c             |   14 +-
 src/sstruct_mv/_hypre_sstruct_mv.h                 |    3 +
 src/sstruct_mv/headers                             |    3 +
 src/sstruct_mv/sstruct_matrix.c                    |   44 +-
 src/sstruct_mv/sstruct_vector.c                    |    8 +-
 src/struct_ls/HYPRE_struct_int.c                   |    9 +-
 src/struct_ls/HYPRE_struct_pcg.c                   |    6 +-
 src/struct_ls/_hypre_struct_ls.h                   |    2 -
 src/struct_ls/cyclic_reduction.c                   |   65 +-
 src/struct_ls/pfmg.c                               |   11 +-
 src/struct_ls/pfmg2_setup_rap.c                    |   92 +-
 src/struct_ls/pfmg3_setup_rap.c                    |  191 +-
 src/struct_ls/pfmg_setup.c                         |  274 +-
 src/struct_ls/pfmg_setup_interp.c                  |   70 +-
 src/struct_ls/pfmg_setup_rap5.c                    |   15 +-
 src/struct_ls/pfmg_setup_rap7.c                    |   10 +-
 src/struct_ls/pfmg_solve.c                         |    2 +-
 src/struct_ls/point_relax.c                        |   62 +-
 src/struct_ls/red_black_constantcoef_gs.c          |  229 +-
 src/struct_ls/red_black_gs.c                       |  125 +-
 src/struct_ls/red_black_gs.h                       |  223 ++
 src/struct_ls/semi_interp.c                        |    8 +-
 src/struct_ls/semi_restrict.c                      |    6 +-
 src/struct_ls/semi_setup_rap.c                     |   19 +-
 src/struct_ls/smg.c                                |    7 +-
 src/struct_ls/smg2_setup_rap.c                     |   72 +-
 src/struct_ls/smg3_setup_rap.c                     |  108 +-
 src/struct_ls/smg_axpy.c                           |    5 +-
 src/struct_ls/smg_relax.c                          |    3 +-
 src/struct_ls/smg_residual.c                       |    9 +-
 src/struct_ls/smg_setup.c                          |    5 +-
 src/struct_ls/smg_setup_interp.c                   |    4 +-
 src/struct_ls/sparse_msg2_setup_rap.c              |   45 +-
 src/struct_ls/sparse_msg3_setup_rap.c              |   61 +-
 src/struct_ls/sparse_msg_filter.c                  |   49 +-
 src/struct_ls/sparse_msg_interp.c                  |   10 +-
 src/struct_ls/sparse_msg_restrict.c                |    8 +-
 src/struct_ls/sparse_msg_setup.c                   |   36 +-
 src/struct_mv/_hypre_struct_mv.h                   | 2919 ++++++++++++++++++--
 src/struct_mv/assumed_part.c                       |   14 +-
 src/struct_mv/box.h                                |  410 +--
 src/struct_mv/box_manager.c                        |   16 +-
 src/struct_mv/boxloop.h                            |  384 +++
 src/struct_mv/boxloop_cuda.h                       |  717 +++++
 src/struct_mv/boxloop_kokkos.h                     |  542 ++++
 src/struct_mv/boxloop_raja.h                       |  845 ++++++
 src/struct_mv/communication_info.c                 |   13 +-
 src/struct_mv/headers                              |   50 +-
 src/struct_mv/protos.h                             |    2 +-
 src/struct_mv/struct_axpy.c                        |   16 +-
 src/struct_mv/struct_communication.c               |  343 ++-
 src/struct_mv/struct_communication.h               |    5 +
 src/struct_mv/struct_copy.c                        |   10 +-
 src/struct_mv/struct_grid.c                        |    6 +-
 src/struct_mv/struct_innerprod.c                   |   42 +-
 src/struct_mv/struct_io.c                          |   46 +-
 src/struct_mv/struct_matrix.c                      |   40 +-
 src/struct_mv/struct_matvec.c                      |   64 +-
 src/struct_mv/struct_scale.c                       |    3 +-
 src/struct_mv/struct_vector.c                      |   80 +-
 src/test/Makefile                                  |    6 +
 src/test/TEST_examples/complex.jobs                |    2 +-
 src/test/TEST_examples/maxdim.jobs                 |    4 +-
 src/test/TEST_ij/smoother.jobs                     |   19 +
 src/test/TEST_ij/smoother.saved                    |   20 +
 src/test/TEST_ij/smoother.sh                       |    5 +
 src/test/TEST_ij/solvers.jobs                      |    3 +
 src/test/TEST_ij/solvers.saved                     |   12 +
 src/test/TEST_ij/solvers.sh                        |    3 +
 .../solvers_ij.jobs}                               |   64 +-
 src/test/TEST_longdouble/solvers_ij.saved          |  145 +
 .../solvers.sh => TEST_longdouble/solvers_ij.sh}   |    0
 src/test/TEST_longdouble/solvers_struct.jobs       |   63 +
 src/test/TEST_longdouble/solvers_struct.saved      |  120 +
 .../solvers_struct.sh}                             |   39 +-
 .../solvers.jobs => TEST_single/solvers_ij.jobs}   |   64 +-
 src/test/TEST_single/solvers_ij.saved              |  145 +
 .../solvers.sh => TEST_single/solvers_ij.sh}       |    7 +-
 src/test/TEST_single/solvers_struct.jobs           |   63 +
 src/test/TEST_single/solvers_struct.saved          |  120 +
 .../smoother.sh => TEST_single/solvers_struct.sh}  |   39 +-
 src/test/TEST_sstruct/solvers.saved                |   80 +-
 src/test/TEST_struct/solvers.saved                 |  120 +-
 src/test/ams_driver.c                              |    4 +-
 src/test/for_maxwell.c                             |    2 +-
 src/test/ij.c                                      |  892 +++---
 src/test/maxwell_unscaled.c                        |    5 +-
 src/test/runtest.sh                                |    2 +-
 src/test/sstruct.c                                 |   12 +-
 src/test/struct.c                                  |   68 +-
 src/test/struct_migrate.c                          |   24 +-
 src/test/struct_newboxloop.c                       | 1956 +++++++++++++
 src/test/zboxloop.c                                |   30 +-
 src/utilities/HYPRE_utilities.h                    |   36 +-
 src/utilities/Makefile                             |    4 +-
 src/utilities/_hypre_utilities.h                   |  631 ++++-
 src/utilities/amg_linklist.h                       |    3 +-
 src/utilities/caliper_instrumentation.h            |    1 +
 src/utilities/exchange_data.h                      |    9 +-
 src/utilities/general.h                            |   11 +-
 src/utilities/{threading.h => gpgpu.h}             |   23 +-
 src/utilities/gpuErrorCheck.c                      |  111 +
 src/utilities/gpuErrorCheck.h                      |  153 +
 src/utilities/gpuMem.c                             |  513 ++++
 src/utilities/gpuMem.h                             |  104 +
 src/utilities/headers                              |   23 +-
 src/utilities/hypre_error.h                        |    3 +-
 src/utilities/hypre_memory.c                       |  244 +-
 src/utilities/hypre_memory.h                       |  185 ++
 src/utilities/hypre_nvtx.h                         |   72 +
 src/utilities/hypre_printf.c                       |   46 +-
 src/utilities/mpistubs.c                           |    2 +-
 src/utilities/mpistubs.h                           |   43 +-
 src/utilities/protos.h                             |  221 ++
 src/utilities/random.c                             |   53 +-
 src/utilities/threading.h                          |    1 +
 src/utilities/timing.c                             |    4 +-
 src/utilities/timing.h                             |    3 +-
 246 files changed, 16688 insertions(+), 4327 deletions(-)
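
For reference, the full patch for this import can be examined from any clone of
the repository that carries this commit; this is plain git usage, shown only as
a convenience and assuming the commit is available locally:

   git show --stat b0efb9cf78d99b6a7f42a3dc709864a9c838189b   # diffstat only
   git show b0efb9cf78d99b6a7f42a3dc709864a9c838189b          # full patch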

diff --git a/AUTOTEST/README.txt b/AUTOTEST/README.txt
index d7131ee..3089bad 100644
--- a/AUTOTEST/README.txt
+++ b/AUTOTEST/README.txt
@@ -1,12 +1,10 @@
 
 This directory contains scripts for running various tests on the hypre library.
-They are run automatically as part of hypre's regression testing, and they are
-run manually to test new distributions of hypre before releasing them to the
-public.  The scripts augment the 'runtest.sh' runtime tests in 'test/TEST_*'.
+The scripts augment the 'runtest.sh' runtime tests in 'test/TEST_*'.
 
-Every test in this directory may be run manually by developers without fear of
-interfering with the auto-testing, as long as they are not run from within the
-auto-testing directory (currently '/usr/casc/hypre/testing').
+Every test in this directory may be run manually by developers.  Many of the
+scripts are also run as part of the nightly regression testing, currently
+developed and maintained in a separate git repository called 'hypre/autotest'.
 
 =====================
 
@@ -17,20 +15,13 @@ files with a '.sh' extension).  Except for a few "special scripts" (below), each
 represents an individual test written by a hypre developer.  The special scripts
 are as follows (note that they are the only scripts with "test" in their names):
 
-1. 'test.sh' - Used to run individual tests locally on a machine.
-2. 'testsrc.sh' - Used to run individual tests on a remote machine.
-3. 'testdist.sh' - Used to test a new distribution before release.
-4. 'autotest.sh' - Usually run in an automatic fashion by 'cron', but may also
-                   be run manually by developers (useful for debugging).
+1. 'test.sh'       - Used to run individual tests.
+2. 'cleantest.sh'  - Used to clean up the output from a test (or tests).
+3. 'renametest.sh' - Used to rename the output from a test.
 
 Usage information for every script (special or individual test) can be obtained
 by running it with the '-h' option (e.g., 'test.sh -h' or 'make.sh -h').
 
-The file 'cronfile' encapsulates the current 'cron' entries for auto-testing.
-It is possible (and probable) to have multiple developers running 'cron' jobs as
-part of the overall auto-testing.  This needs to be coordinated if the output
-files are being written to the global auto-testing directory.
-
 =====================
 
 Writing tests:
@@ -43,8 +34,7 @@ To write a new test, just use an existing test (e.g., 'default.sh') as a
 template and make the appropriate modifications.  Try not to use the word "test"
 in the name of the script so that we can keep the convention of only the special
 scripts having this in their names.  Try not to use absolute directory paths in
-the script.  If in doubt, talk to another developer or send an inquiry to
-hypre-support at llnl.gov.
+the script.
 
 =====================
 
@@ -52,7 +42,6 @@ Design goals:
 
 - Minimal limitations on the types of tests that are possible.
 - Developers should be able to run the tests manually.
-- Tests should be runable on both the repository and each release.
 - Minimal dependence on operating system and software tools (for portability).
 - Developers should be able to easily add new tests.
 - Simplicity and flexibility.
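
The reorganized scripts are driven as in the machine-*.sh files further down in
this patch: 'test.sh' runs an individual test against a source tree, and
'renametest.sh' stores its output under a chosen name.  A minimal sketch, with
option values taken from the hunks below and '$output_dir' standing for
whatever output directory the caller defines:

   cd AUTOTEST
   ./test.sh basic.sh ../src -co: --enable-debug -mo: test
   ./renametest.sh basic $output_dir/basic--enable-debug
   ./cleantest.sh    # clean up leftover output from a test run
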
diff --git a/AUTOTEST/autotest.sh b/AUTOTEST/autotest.sh
deleted file mode 100755
index 49cd6ce..0000000
--- a/AUTOTEST/autotest.sh
+++ /dev/null
@@ -1,283 +0,0 @@
-#!/bin/sh
-#BHEADER**********************************************************************
-# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
-# Produced at the Lawrence Livermore National Laboratory.
-# This file is part of HYPRE.  See file COPYRIGHT for details.
-#
-# HYPRE is free software; you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License (as published by the Free
-# Software Foundation) version 2.1 dated February 1999.
-#
-# $Revision$
-#EHEADER**********************************************************************
-
-# Setup
-testing_dir=`cd ..; pwd`
-autotest_dir="$testing_dir/AUTOTEST"
-finished_dir="$testing_dir/AUTOTEST-FINISHED"
-output_dir="$testing_dir/AUTOTEST-`date +%Y.%m.%d-%a`"
-src_dir="$testing_dir/hypre/src"
-remote_dir="test-hypre"
-summary_file="SUMMARY.html"
-summary_subject="Autotest Error Summary `date +%Y-%m-%d`"
-email_list="rfalgout at llnl.gov, tzanio at llnl.gov, umyang at llnl.gov, schroder2 at llnl.gov, oseikuffuor1 at llnl.gov, wang84 at llnl.gov, li50 at llnl.gov"
-
-# Main loop
-test_opts=""
-while [ "$*" ]
-do
-   case $1 in
-      -h|-help)
-         cat <<EOF
-
-   $0 [options] [-checkout | -dist M.mm.rr | -{test1} ... | -summary]
-
-   where:
-
-      -checkout           Checks out the repository and updates the current AUTOTEST
-                          directory.  Should be called before running tests.
-      -dist               Use the hypre release M.mm.rr (e.g. 2.4.0b). This is an
-                          alternative to -checkout and is used by testdist.sh.
-      -{test}             Runs the indicated tests in sequence, which are associated
-                          with specific machine names (e.g., -tux149, -alc, -up).
-      -summary            Generates a summary file of passed, pending, failed tests.
-      -summary-email      Same as -summary, but also sends developers an email.
-      -summary-copy {dir} Same as -summary, but also copies to remote test {dir}.
-
-   with options:
-
-      -h|-help       prints this usage information and exits
-      -t|-trace      echo each command
-
-   The main purpose of this script is to organize the automatic testing process
-   and to ensure that all related files have the appropriate permissions.
-
-   Example usage: $0 -checkout
-                  $0 -tux149 -alc
-                  $0 -summary
-                  $0 -summary-copy tux149:/usr/casc/hypre/testing
-
-EOF
-         exit
-         ;;
-
-      -t|-trace)
-         set -xv
-         shift
-         ;;
-
-      # Checkout the repository and update the global AUTOTEST directory
-      -checkout)
-         cd $testing_dir
-         if [ ! -d hypre ]; then
-            echo "Clone the hypre directory in $testing_dir first"
-            exit
-         else
-            cd hypre; git checkout .; git pull; cd ..
-         fi
-         trap "cp -fR $testing_dir/hypre/AUTOTEST $testing_dir" EXIT
-         test_opts=""
-         break
-         ;;
-
-     -dist)
-         shift
-         finished_dir="$testing_dir/AUTOTEST-hypre-$1"
-         src_dir="$testing_dir/hypre-$1/src"
-         remote_dir="test-hypre-$1"
-         shift
-         ;;
-
-      # Generate a summary file in the output directory
-      -summary*)
-         # move the finished logs to todays output directory
-         # (using 'cp' then 'rm' produces fewer complaints than using 'mv')
-         # (the autotest-* files are removed below if not pending)
-         # (check first that the files exist to reduce error messages from 'cp')
-         mkdir -p $output_dir
-         count=$( find $finished_dir -mindepth 1 -name "*" | wc -m )
-         if [ $count -ne 0 ]; then
-            cp -fr $finished_dir/* $output_dir
-            rm -fr $finished_dir/*
-         fi
-         count=$( find $autotest_dir -mindepth 1 -name "autotest-*" | wc -m )
-         if [ $count -ne 0 ]; then
-            cp -f  $autotest_dir/autotest-* $output_dir
-         fi
-
-         cd $output_dir
-         echo "<html>"          > $summary_file;
-         echo "<head> </head>" >> $summary_file;
-         echo "<PRE>"          >> $summary_file;
-         echo $summary_subject >> $summary_file
-
-
-         # all top-level tests with empty error files are reported as "passed",
-         # not including the cron autotest logs
-         echo ""         >> $summary_file;
-         echo "[PASSED]" >> $summary_file
-         for test in $( find . -maxdepth 1 -size 0 -name "*.err" ! -name "*cron*" )
-         do
-            testname=`basename $test .err`
-            echo "-${testname#machine-}" >> $summary_file
-         done
-
-         # active tests without a *-done file are reported as "pending"
-         echo ""          >> $summary_file;
-         echo "[PENDING]" >> $summary_file
-         for test in $( find . -name "autotest-*-start" )
-         do
-            testbase=`basename $test -start`
-            if [ ! -e $testbase-done ]; then
-               echo $testbase | sed {s/autotest//g} >> $output_dir/$summary_file
-            else
-               rm -f $autotest_dir/$testbase*
-            fi
-         done
-
-         # all top-level tests with non-empty error files are reported as "failed",
-         # including the cron autotest logs
-         echo ""         >> $summary_file;
-         echo "[FAILED]" >> $summary_file
-         for test in $( find . -maxdepth 1 ! -size 0 -name "*.err" )
-         do
-            testname=`basename $test .err`
-            for prefix in "machine-" "autotest-";
-            do
-               testname="${testname#$prefix}"
-            done
-            echo "-$testname" >> $summary_file
-         done
-
-         # keep a time stamp of last runs and report if more than 10 days
-         echo ""           >> $summary_file;
-         echo "[LAST RUN]" >> $summary_file
-         for test in $( find . -maxdepth 1 -name "autotest-*-done" )
-         do
-            testname=`basename $test -done`
-            testname="${testname#autotest-}"
-            touch $testing_dir/lastrun-$testname
-         done
-         for test in $( find $testing_dir -maxdepth 1 -name "lastrun-*" -atime +10 )
-         do
-            testdate=`ls -l $test | awk '{print $6" "$7" "$8}'`
-            testname=`basename $test`
-            testname="${testname#lastrun-}"
-            echo "-$testname  $testdate" >> $summary_file
-         done
-
-         # list all non-empty error files in todays output directory
-         echo ""              >> $summary_file;
-         echo "[ERROR FILES]" >> $summary_file
-         for test in $( find $output_dir ! -size 0 -name "*.err" | sort -r )
-         do
-            echo "<a href=\"file://$test\">$test</a>" >> $summary_file
-         done
-
-         echo "</PRE>"  >> $summary_file;
-         echo "</html>" >> $summary_file;
-
-         if [ "$1" = "-summary-email" ]; then
-            # send the email
-            (
-               echo To: $email_list
-               echo Subject: $summary_subject
-               echo Content-Type: text/html
-               echo MIME-Version: 1.0
-
-               cat $summary_file
-
-            ) | /usr/sbin/sendmail -t
-         fi
-
-         if [ "$1" = "-summary-copy" ]; then
-            # copy output_dir files to the specified remote testing_dir
-            rem_finished_dir="$2/AUTOTEST-FINISHED"
-            scp -q -r * $rem_finished_dir
-         fi
-
-         test_opts=""
-         break
-         ;;
-
-      *)
-         test_opts="$test_opts $1"
-         shift
-         ;;
-   esac
-done
-
-# Ensure that important directories exist
-if [ -n "$test_opts" ]; then
-   cd $testing_dir
-   mkdir -p $autotest_dir
-   mkdir -p $finished_dir
-   cd $autotest_dir
-fi
-
-# Run tests
-for opt in $test_opts
-do
-   # TODO: use a "-<testname>:<hostname>" format to avoid this?
-   case $opt in
-      -tux[0-9]*-compilers)
-         host=`echo $opt | awk -F- '{print $2}'`
-         name="tux-compilers"
-         ;;
-
-      -tux[0-9]*)
-         host=`echo $opt | awk -F- '{print $2}'`
-         name="tux"
-         ;;
-
-      -mac)
-         host="parsol"
-         name="mac"
-         ;;
-
-      *)
-         host=`echo $opt | awk -F- '{print $2}'`
-         name=$host
-         ;;
-   esac
-
-   if [ ! -e autotest-$name-start ]; then
-      echo "Test [machine-$name] started at  `date +%T` on `date +%D`" \
-         >> autotest-$name-start
-      ./testsrc.sh $src_dir $host:$remote_dir/$host machine-$name.sh
-      echo "Test [machine-$name] finished at `date +%T` on `date +%D`" \
-         >> autotest-$name-start
-      mv machine-$name.??? $finished_dir
-      touch autotest-$name-done
-   fi
-done
-
-# Fix permissions
-cd $testing_dir
-ch_dirs="hypre $autotest_dir $finished_dir $output_dir"
-for dir in $ch_dirs lastrun-*
-do
-   if [ -e $dir ]; then
-      chmod -fR a+rX,ug+w,o-w $dir
-      # chgrp -fR hypre         $dir
-   fi
-done
-
-# move all but the last 10 autotest results into yearly subdirectories
-files=`echo AUTOTEST-2*.*`
-count=`echo $files | wc | awk '{print $2}'`
-for i in $files
-do
-   if [ $count -le 10 ]; then
-      break;
-   fi
-   dir=`echo $i | awk -F '.' '{print $1}'`
-   if [ ! -d $dir ]; then
-      mkdir $dir
-      chmod -fR a+rX,ug+w,o-w $dir
-      # chgrp -fR hypre         $dir
-   fi
-   mv $i $dir/$i
-   count=`expr $count - 1`
-done
-
diff --git a/AUTOTEST/basictest.sh b/AUTOTEST/basic.sh
similarity index 100%
rename from AUTOTEST/basictest.sh
rename to AUTOTEST/basic.sh
diff --git a/AUTOTEST/check-double.filters b/AUTOTEST/check-double.filters
index 6825658..284fb05 100644
--- a/AUTOTEST/check-double.filters
+++ b/AUTOTEST/check-double.filters
@@ -1,4 +1,11 @@
 /_hypre_utilities.h:
 /HYPRE_utilities.h:
-/mpistubs.c
+/utilities/general.h:
+/utilities/gpuErrorCheck.h:
+/utilities/gpuErrorCheck.c:
+/utilities/gpuMem.h:
+/utilities/hypre_nvtx.h:
+/utilities/mpistubs.c
+/seq_mv/gpukernels.h:
+/seq_mv/seq_mv.h:.*cudaStream_t
 double-check
diff --git a/AUTOTEST/check-int.filters b/AUTOTEST/check-int.filters
index ce8dc96..db59bbc 100644
--- a/AUTOTEST/check-int.filters
+++ b/AUTOTEST/check-int.filters
@@ -3,6 +3,14 @@
 /hypre_printf.c:
 /_hypre_utilities.h:
 /HYPRE_utilities.h:
+/utilities/general.h:
+/utilities/gpuErrorCheck.h:
+/utilities/gpuErrorCheck.c:
+/utilities/gpuMem.h:
+/utilities/hypre_nvtx.h:
+/utilities/mpistubs.c
+/seq_mv/gpukernels.h:
+/seq_mv/seq_mv.h:.*cudaStream_t
 as long as
 too long
 long range interpolation
diff --git a/AUTOTEST/check-mpi.filters b/AUTOTEST/check-mpi.filters
index 05f9bdc..002d8ac 100644
--- a/AUTOTEST/check-mpi.filters
+++ b/AUTOTEST/check-mpi.filters
@@ -1,7 +1,10 @@
 /HYPRE_config.h:
 /_hypre_utilities.h:
 /HYPRE_utilities.h:
-/mpistubs.c:
-/mpistubs.h:
-/thread_mpistubs.c:
-/thread_mpistubs.h:
+/utilities/gpuMem.c:
+/utilities/mpistubs.c:
+/utilities/mpistubs.h:
+/parcsr_mv/par_csr_matvec.c:.*MPI_PACK
+/parcsr_mv/par_csr_matvec.c:.*MPI_HALO_EXC_SEND
+/parcsr_mv/par_csr_matvec.c:.*MPI_HALO_EXC_RECV
+/parcsr_mv/par_csr_matvec.c:.*MPI_UNPACK
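
The filter entries above are written as 'path:' or 'path:regex' prefixes, i.e.
patterns over grep-style 'file:match' output.  How the check scripts consume
them is not shown in this patch; one plausible (hypothetical) reading is that a
check's raw hits are piped through 'grep -v -f' to drop the known-acceptable
matches:

   # hypothetical sketch of how a check-*.filters file might be applied
   grep -rn "MPI_" src | grep -v -f AUTOTEST/check-mpi.filters > check-mpi.err
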
diff --git a/AUTOTEST/cmaketest.sh b/AUTOTEST/cmake.sh
similarity index 98%
rename from AUTOTEST/cmaketest.sh
rename to AUTOTEST/cmake.sh
index 699be44..9806bd4 100755
--- a/AUTOTEST/cmaketest.sh
+++ b/AUTOTEST/cmake.sh
@@ -123,5 +123,5 @@ cd $src_dir
 rm -fr `echo cmbuild/* | sed 's/[^ ]*README.txt//g'`
 rm -fr `echo test/cmbuild/* | sed 's/[^ ]*README.txt//g'`
 rm -fr hypre
-( cd $src_dir/test; rm -f $drivers; cleantest.sh )
+( cd $src_dir/test; rm -f $drivers; ./cleantest.sh )
 
diff --git a/AUTOTEST/cronfile b/AUTOTEST/cronfile
deleted file mode 100644
index 91ccc1a..0000000
--- a/AUTOTEST/cronfile
+++ /dev/null
@@ -1,29 +0,0 @@
-# The first five entries on each line correspond respectively to:
-#
-# minute (0-56)
-# hour (0-23)
-# day of month (1-31)
-# month (1-12)
-# day of week (0-6)(0=Sunday)
-#
-# '*' means "every" and '*/m' means "every m-th"
-
-# Rob's crontab (on tux339)
-
-30 23 * * * source /etc/profile; source $HOME/.bashrc; cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -checkout >> autotest-tux-cron.out 2>> autotest-tux-cron.err
-00  1 * * * source /etc/profile; source $HOME/.bashrc; cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -tux339   >> autotest-tux-cron.out 2>> autotest-tux-cron.err
-00  6 * * * source /etc/profile; source $HOME/.bashrc; cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -summary-email
-
-
-# Rob's crontab (on rzcereal2)
-
-40 0 * * * source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -checkout   > autotest-rzcereal2-checkout.out 2>&1
-00 1 * * 2 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -vulcan    >> autotest-vulcan-cron.out    2>> autotest-vulcan-cron.err
-00 1 * * 3 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -rzmerl    >> autotest-rzmerl-cron.out    2>> autotest-rzmerl-cron.err
-00 3 * * 3 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -rzcereal3 >> autotest-rzcereal3-cron.out 2>> autotest-rzcereal3-cron.err
-00 1 * * 4 source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -rzzeus    >> autotest-rzzeus-cron.out    2>> autotest-rzzeus-cron.err
-00 5 * * * source /etc/profile; source $HOME/.profile; cd $HOME/test-hypre/AUTOTEST; ./autotest.sh -summary-copy tux339:/usr/casc/hypre/test-hypre
-
-# Tzanio's crontab (on tux252)
-
-00 1 * * * cd /usr/casc/hypre/test-hypre/AUTOTEST; ./autotest.sh -mac >> autotest-mac-cron.out 2>> autotest-mac-cron.err
diff --git a/AUTOTEST/machine-mac.sh b/AUTOTEST/machine-mac.sh
index 64e8163..5d83198 100755
--- a/AUTOTEST/machine-mac.sh
+++ b/AUTOTEST/machine-mac.sh
@@ -51,16 +51,16 @@ mo="test"
 ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED"
 
 co="--disable-fortran"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh basictest $output_dir/basictest-default
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh basic $output_dir/basic-default
 
 co="--enable-debug --disable-fortran"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-debug
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-debug
 
 co="--enable-bigint --disable-fortran"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-bigint
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-bigint
 
 # Test linking for different languages
 link_opts="all++"
diff --git a/AUTOTEST/machine-rzmerl.sh b/AUTOTEST/machine-rztopaz.sh
similarity index 73%
rename from AUTOTEST/machine-rzmerl.sh
rename to AUTOTEST/machine-rztopaz.sh
index b6df2fc..a4782ec 100755
--- a/AUTOTEST/machine-rzmerl.sh
+++ b/AUTOTEST/machine-rztopaz.sh
@@ -18,14 +18,14 @@ case $1 in
    -h|-help)
       cat <<EOF
 
-   **** Only run this script on the rzmerl machine ****
+   **** Only run this script on the rztopaz machine ****
 
    $0 [-h|-help] {src_dir}
 
    where: -h|-help   prints this usage information and exits
           {src_dir}  is the hypre source directory
 
-   This script runs a number of tests suitable for the rzmerl machine.
+   This script runs a number of tests suitable for the rztopaz machine.
 
    Example usage: $0 ../src
 
@@ -47,21 +47,21 @@ mo="test"
 ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED"
 
 co=""
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh basictest $output_dir/basictest-default
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh basic $output_dir/basic-default
 
 co="--with-openmp"
 RO="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED -nthreads 2"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $RO
-renametest.sh basictest $output_dir/basictest--with-openmp
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $RO
+./renametest.sh basic $output_dir/basic--with-openmp
 
 co="--enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-debug
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-debug
 
 co="--enable-bigint"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-bigint
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-bigint
 
 # Test linking for different languages
 link_opts="all++ all77"
diff --git a/AUTOTEST/machine-rzzeus.sh b/AUTOTEST/machine-syrah.sh
similarity index 74%
rename from AUTOTEST/machine-rzzeus.sh
rename to AUTOTEST/machine-syrah.sh
index 54def77..7067590 100755
--- a/AUTOTEST/machine-rzzeus.sh
+++ b/AUTOTEST/machine-syrah.sh
@@ -18,14 +18,14 @@ case $1 in
    -h|-help)
       cat <<EOF
 
-   **** Only run this script on the zeus cluster ****
+   **** Only run this script on the syrah cluster ****
 
    $0 [-h|-help] {src_dir}
 
    where: -h|-help   prints this usage information and exits
           {src_dir}  is the hypre source directory
 
-   This script runs a number of tests suitable for the zeus cluster.
+   This script runs a number of tests suitable for the syrah cluster.
 
    Example usage: $0 ../src
 
@@ -47,20 +47,20 @@ mo="test"
 ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED"
 
 co=""
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh basictest $output_dir/basictest-default
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh basic $output_dir/basic-default
 
 co="--enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-debug
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-debug
 
 co="--enable-bigint"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-bigint
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-bigint
 
 co="--with-blas --with-lapack --with-blas-lib-dirs=/usr/lib64 --with-lapack-lib-dirs=/usr/lib64 --with-blas-libs=blas --with-lapack-libs=lapack"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--with-blas
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--with-blas
 
 # Test linking for different languages
 link_opts="all++ all77"
diff --git a/AUTOTEST/machine-tux.sh b/AUTOTEST/machine-tux.sh
index a2c82f1..3bf7d6e 100755
--- a/AUTOTEST/machine-tux.sh
+++ b/AUTOTEST/machine-tux.sh
@@ -49,67 +49,75 @@ ro="-ams -ij -sstruct -struct"
 eo=""
 
 co=""
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest-default
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic-default
 
 co="--enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -eo: $eo
-renametest.sh basictest $output_dir/basictest-debug1
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -eo: $eo
+./renametest.sh basic $output_dir/basic-debug1
 
 co="--enable-debug --enable-global-partition"
 RO="-fac"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $RO -eo: $eo
-renametest.sh basictest $output_dir/basictest-debug2
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $RO -eo: $eo
+./renametest.sh basic $output_dir/basic-debug2
 
 co="--enable-debug CC=mpiCC"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo
-renametest.sh basictest $output_dir/basictest-debug-cpp
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo
+./renametest.sh basic $output_dir/basic-debug-cpp
 
 # co="--with-insure --enable-debug --with-print-errors"
 # MO="test"
-# test.sh basictest.sh $src_dir -co: $co -mo: $MO -ro: $ro
-# renametest.sh basictest $output_dir/basictest--with-insure1
+# ./test.sh basic.sh $src_dir -co: $co -mo: $MO -ro: $ro
+# ./renametest.sh basic $output_dir/basic--with-insure1
 # 
 # co="--with-insure --enable-debug --enable-global-partition"
 # MO="test"
-# test.sh basictest.sh $src_dir -co: $co -mo: $MO -ro: $ro
-# renametest.sh basictest $output_dir/basictest--with-insure2
+# ./test.sh basic.sh $src_dir -co: $co -mo: $MO -ro: $ro
+# ./renametest.sh basic $output_dir/basic--with-insure2
 
 co="--enable-debug --with-print-errors"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind
-renametest.sh basictest $output_dir/basictest--valgrind1
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind
+./renametest.sh basic $output_dir/basic--valgrind1
 
 co="--enable-debug --enable-global-partition"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind
-renametest.sh basictest $output_dir/basictest--valgrind2
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -rt -valgrind
+./renametest.sh basic $output_dir/basic--valgrind2
 
 co="--without-MPI"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--without-MPI
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--without-MPI
 
 co="--with-strict-checking"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--with-strict-checking
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--with-strict-checking
 
 co="--enable-shared"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-shared
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-shared
 
 co="--enable-bigint --enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: -bigint
-renametest.sh basictest $output_dir/basictest--enable-bigint
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: -bigint
+./renametest.sh basic $output_dir/basic--enable-bigint
+
+co="--enable-single --enable-debug"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: -single
+./renametest.sh basic $output_dir/basic--enable-single
+
+co="--enable-longdouble --enable-debug"
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: -longdouble
+./renametest.sh basic $output_dir/basic--enable-longdouble
 
 co="--enable-maxdim=4 --enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -eo: -maxdim
-renametest.sh basictest $output_dir/basictest--enable-maxdim=4
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -eo: -maxdim
+./renametest.sh basic $output_dir/basic--enable-maxdim=4
 
 co="--enable-complex --enable-maxdim=4 --enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -eo: -complex
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -eo: -complex
 # ignore complex compiler output for now
-rm -fr basictest.dir/make.???
-grep -v make.err basictest.err > basictest.tmp
-mv basictest.tmp basictest.err
-renametest.sh basictest $output_dir/basictest--enable-complex
+rm -fr basic.dir/make.???
+grep -v make.err basic.err > basic.tmp
+mv basic.tmp basic.err
+./renametest.sh basic $output_dir/basic--enable-complex
 
 # CMake build and run tests
 mo="-j"
@@ -117,28 +125,36 @@ ro="-ams -ij -sstruct -struct"
 eo=""
 
 co=""
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo
-renametest.sh cmaketest $output_dir/cmaketest-default
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo
+./renametest.sh cmake $output_dir/cmake-default
 
 co="-DCMAKE_BUILD_TYPE=Debug"
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh cmaketest $output_dir/cmaketest-debug
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh cmake $output_dir/cmake-debug
 
 co="-DHYPRE_NO_GLOBAL_PARTITION=OFF"
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo
-renametest.sh cmaketest $output_dir/cmaketest-global-partition
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo
+./renametest.sh cmake $output_dir/cmake-global-partition
 
 co="-DHYPRE_SEQUENTIAL=ON"
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo
-renametest.sh cmaketest $output_dir/cmaketest-sequential
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo
+./renametest.sh cmake $output_dir/cmake-sequential
 
 co="-DHYPRE_SHARED=ON"
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo
-renametest.sh cmaketest $output_dir/cmaketest-shared
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo
+./renametest.sh cmake $output_dir/cmake-shared
 
 co="-DHYPRE_BIGINT=ON"
-test.sh cmaketest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh cmaketest $output_dir/cmaketest-bigint
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh cmake $output_dir/cmake-bigint
+
+co="-DHYPRE_SINGLE=ON"
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: -single
+./renametest.sh cmake $output_dir/cmake-single
+
+co="-DHYPRE_LONG_DOUBLE=ON"
+./test.sh cmake.sh $src_dir -co: $co -mo: $mo -ro: -longdouble
+./renametest.sh cmake $output_dir/cmake-longdouble
 
 # cmake build doesn't currently support maxdim
 # cmake build doesn't currently support complex
diff --git a/AUTOTEST/machine-vulcan.sh b/AUTOTEST/machine-vulcan.sh
index dfe402c..ea5fef6 100755
--- a/AUTOTEST/machine-vulcan.sh
+++ b/AUTOTEST/machine-vulcan.sh
@@ -47,16 +47,16 @@ mo="test"
 ro="-ams -ij -sstruct -struct -rt -D HYPRE_NO_SAVED"
 
 co=""
-test.sh basictest.sh $src_dir -co: $co -mo: $mo -ro: $ro
-renametest.sh basictest $output_dir/basictest-default
+./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro
+./renametest.sh basic $output_dir/basic-default
 
 co="--enable-debug"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-debug
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-debug
 
 co="--enable-bigint"
-test.sh basictest.sh $src_dir -co: $co -mo: $mo
-renametest.sh basictest $output_dir/basictest--enable-bigint
+./test.sh basic.sh $src_dir -co: $co -mo: $mo
+./renametest.sh basic $output_dir/basic--enable-bigint
 
 # Test linking for different languages
 link_opts="all++ all77"
diff --git a/AUTOTEST/testdist.sh b/AUTOTEST/testdist.sh
deleted file mode 100755
index 680aa12..0000000
--- a/AUTOTEST/testdist.sh
+++ /dev/null
@@ -1,174 +0,0 @@
-#!/bin/sh
-#BHEADER**********************************************************************
-# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
-# Produced at the Lawrence Livermore National Laboratory.
-# This file is part of HYPRE.  See file COPYRIGHT for details.
-#
-# HYPRE is free software; you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License (as published by the Free
-# Software Foundation) version 2.1 dated February 1999.
-#
-# $Revision$
-#EHEADER**********************************************************************
-
-# Which tests to run?
-TEST_PATCH="-tux339"
-TEST_MINOR="$TEST_PATCH -rzzeus -rzmerl -vulcan"
-TEST_MAJOR="$TEST_MINOR"
-TERMCMD=""
-
-while [ "$*" ]
-do
-   case $1 in
-      -h|-help)
-         cat <<EOF
-
-   $0 [options] {release}
-
-   where: {release}  is a hypre release tar file (gzipped, absolute path)
-
-   with options:
-      -xterm         run the tests in parallel using multiple xterm windows
-      -h|-help       prints this usage information and exits
-      -t|-trace      echo each command
-
-   This script unpacks {release} in the parent directory and lists the tests
-   needed to verify it, based on the type of release (MAJOR, MINOR, or PATCH).
-   If all required tests pass, a verification file is generated containing the
-   logs from the runs.  Otherwise, tests that have failed or have not been run
-   yet can be started, and the script will have to be re-run after their
-   completion to generate the verification file.
-
-   Example usage: $0 /usr/casc/hypre/hypre-2.10.1.tar.gz
-
-   NOTE: The absolute path for the release is required.
-
-   NOTE: Because of ssh restrictions at LLNL, run this script on an LC machine.
-   You may need to change the default tux platform at the top of this file to
-   your own tux machine.  Finally, for each release tar file, it is recommended
-   that you run this script inside a separate copy of the AUTOTEST directory
-   (this will avoid result conflicts in common tests).
-
-EOF
-         exit
-         ;;
-      -t|-trace)
-         set -xv
-         shift
-         ;;
-      -xterm)
-         # Get the terminal command and make sure it runs bash
-         TERMCMD="$TERM -e"; SHELL=/bin/sh
-         shift
-         ;;
-      *)
-         break
-         ;;
-   esac
-done
-
-# Setup
-testing_dir=`cd ..; pwd`
-autotest_dir="$testing_dir/AUTOTEST"
-release_file=$1
-release_dir=`basename $release_file | awk -F.tar '{print $1}'`
-release=`echo $release_dir | sed 's/hypre-//' | sed 's/.tar.gz//'`
-output_dir="$testing_dir/AUTOTEST-hypre-$release"
-case $release in
-   [1-9][0-9]*.0.0)                     NAME="MAJOR"; TESTS=$TEST_MAJOR ;;
-   [1-9][0-9]*.[1-9][0-9]*.0)           NAME="MINOR"; TESTS=$TEST_MINOR ;;
-   [1-9][0-9]*.[1-9][0-9]*.[1-9][0-9]*) NAME="PATCH"; TESTS=$TEST_PATCH ;;
-   *)                                   NAME="PATCH"; TESTS=$TEST_PATCH ;;
-esac
-
-# Extract the release
-cd $testing_dir
-echo "Checking the distribution file..."
-tmpdir=$release_dir.TMP
-mkdir -p $tmpdir
-rm -rf $tmpdir/$release_dir
-tar -C $tmpdir -zxf $release_file
-if !(diff -r $release_dir $tmpdir/$release_dir 2>/dev/null 1>&2) then
-   rm -rf $release_dir $output_dir $autotest_dir/autotest-*
-   tar -zxf $release_file
-fi
-rm -rf $tmpdir
-echo ""
-echo "The following tests are needed to verify this $NAME release: $TESTS"
-echo ""
-
-# List the status of the required tests
-cd $autotest_dir
-NOTRUN=""
-FAILED=""
-PENDING=""
-for test in $TESTS
-do
-   name=`echo $test | sed 's/[0-9]//g'`
-   # Determine failed, pending, passed and tests that have not been run
-   if [ -f $output_dir/machine$name.err ]; then
-      if [ -s $output_dir/machine$name.err ]; then
-         status="[FAILED] "; FAILED="$FAILED $test"
-      else
-         status="[PASSED] ";
-      fi
-   elif [ ! -e autotest$name-start ]; then
-      status="[NOT RUN]"; NOTRUN="$NOTRUN $test"
-   elif [ ! -e autotest$name-done ]; then
-      status="[PENDING]"; PENDING="$PENDING $test"
-   else
-      status="[UNKNOWN]";
-   fi
-   if [ "$TERMCMD" == "" ]; then
-      echo "$status ./autotest.sh -dist $release $test"
-   else
-      echo "$status $TERMCMD ./autotest.sh -dist $release $test &"
-   fi
-done
-
-# If all tests have been run, create a tarball of the log files
-if [ "$NOTRUN$PENDING" == "" ]; then
-   echo ""; echo "Generating the verification file AUTOTEST-hypre-$release.tgz"
-   cd $testing_dir
-   mv -f $autotest_dir/autotest-* $output_dir
-   tar -zcf $autotest_dir/AUTOTEST-hypre-$release.tgz `basename $output_dir`
-fi
-
-# If all tests have passed, print a message and exit
-if [ "$NOTRUN$FAILED$PENDING" == "" ]; then
-   echo "The release is verified!"
-   exit
-fi
-
-cat <<EOF
-
-The release can not be automatically verified at this time because not all tests
-are listed as [PASSED].  You may choose to continue with the release anyway, but
-it is your responsibility to ensure that the test errors are acceptable.
-
-This script can start the remaining tests now.  Alternatively, you can run the
-above commands manually (or in a cron job). If you do this, make sure to examine
-the standart error of the autotest.sh script.
-
-EOF
-
-echo -n "Do you want to start the remaining tests? (yes,no) : "
-read -e RUN
-if [ "$RUN" == "yes" ]; then
-   for test in $FAILED $NOTRUN
-   do
-      name=`echo $test | sed 's/[0-9]//g'`
-      rm -rf $output_dir/machine$name.??? autotest$name*
-      if [ "$TERMCMD" == "" ]; then
-         echo "Running test [./autotest.sh -dist $release $test]"
-         ./autotest.sh -dist $release $test 2>> autotest$name.err
-      else
-         echo "Running test [$TERMCMD ./autotest.sh -dist $release $test &]"
-         $TERMCMD "./autotest.sh -dist $release $test 2>> autotest$name.err" 2>> autotest$name.err &
-      fi
-      echo ""
-   done
-fi
-echo ""
-echo "Re-run the script after tests have completed to verify the release."
-echo ""
diff --git a/AUTOTEST/testsrc.sh b/AUTOTEST/testsrc.sh
deleted file mode 100755
index cf95a06..0000000
--- a/AUTOTEST/testsrc.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/sh
-#BHEADER**********************************************************************
-# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
-# Produced at the Lawrence Livermore National Laboratory.
-# This file is part of HYPRE.  See file COPYRIGHT for details.
-#
-# HYPRE is free software; you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License (as published by the Free
-# Software Foundation) version 2.1 dated February 1999.
-#
-# $Revision$
-#EHEADER**********************************************************************
-
-while [ "$*" ]
-do
-   case $1 in
-      -h|-help)
-         cat <<EOF
-
-   $0 [options] {src_dir} {machine:rem_path} {testname}.sh
-
-   where: {src_dir}  is the hypre source directory
-          {machine}  is the name of the machine to run on
-          {rem_path} is the remote path where the {src_dir} directory
-                     will be copied
-          {testname} is the user-defined name for the test script
-
-   with options:
-      -h|-help       prints this usage information and exits
-      -t|-trace      echo each command
-
-   This script is a specialized version of 'test.sh' that runs script
-   {testname}.sh remotely on {machine}.  It is assumed that {testname}.sh takes
-   only one argument, which will be set to '..' on the remote machine.
-
-   The script first copies the {src_dir} directory into {machine:rem_path}, then
-   copies the current AUTOTEST script directory there (potentially overwriting
-   an already existing AUTOTEST directory).
-
-   The output is still collected locally in exactly the same way as 'test.sh'.
-
-   Example usage: $0 ../src tux149:. machine-tux.sh
-
-EOF
-         exit
-         ;;
-      -t|-trace)
-         set -xv
-         shift
-         ;;
-      *)
-         break
-         ;;
-   esac
-done
-
-# Setup
-src_dir=`cd $1; pwd`
-machine=`echo $2 | awk -F: '{print $1}'`
-rem_path=`echo $2 | awk -F: '{print $2}'`
-testname=`basename $3 .sh`
-rem_dir=`basename $src_dir`
-
-# Copy the source and AUTOTEST directories using rsync/tar+ssh
-# ssh $machine "rm -fr $rem_path/$rem_dir"
-# scp -r $src_dir $machine:$rem_path/$rem_dir
-# scp -r . $machine:$rem_path/$rem_dir/AUTOTEST
-echo "Copying sources to $machine"
-rem_dir_exists=`ssh -q $machine "(/bin/sh -c \"[ -d $rem_path/$rem_dir ] && echo \"yes\" || (mkdir -p $rem_path/$rem_dir; echo \"no\")\")"`
-if [ "$rem_dir_exists" == "no" ]
-then
-   tar -C `dirname $src_dir` -zcf - $rem_dir | ssh -q $machine tar -C $rem_path -zxf -
-else
-   rsync -zae "ssh -q" --delete $src_dir/ $USER@$machine:$rem_path/$rem_dir
-fi
-rsync -zae "ssh -q" --delete . $USER@$machine:$rem_path/$rem_dir/AUTOTEST
-
-# Run the test and copy the results
-# Use the '.hyprerc' file when needed to customize the environment
-hyprerc_exists=`ssh -q $machine "( /bin/sh -c '[ -f .hyprerc ] && echo yes' )"`
-if [ "$hyprerc_exists" == "yes" ]
-then
-   ssh -q $machine "source .hyprerc; cd $rem_path/$rem_dir/AUTOTEST; ./test.sh ${testname}.sh .."
-else
-   ssh -q $machine "cd $rem_path/$rem_dir/AUTOTEST; ./test.sh ${testname}.sh .."
-fi
-rm -fr $testname.???
-echo "Copying output files from $machine"
-scp -q -r $machine:$rem_path/$rem_dir/AUTOTEST/$testname.\?\?\? .
diff --git a/CHANGELOG b/CHANGELOG
index b659fda..c0938c3 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -15,6 +15,20 @@
 # recent release.
 #=============================================================================
 
+Version 2.12.1 released 2017/09/29
+
+- Added support for single and quad precision floating point numbers
+
+- Added weighted Jacobi (relax_type 7) to be usable on GPU in BoomerAMG
+
+- Various bug fixes
+
+Version 2.12.0 released 2017/05/02
+
+- Added GPU support to hypre.  The Struct and SStruct code can use CUDA, RAJA,
+  or KOKKOS.  The ParCSR code uses CUDA.  The BoomerAMG setup phase is not yet
+  ported to the GPU.  This release uses unified memory.
+
 Version 2.11.2 released 2017/03/13
 
 - Changed the defaults in hypre to HMIS with ext+i(4) interpolation
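
The new precision modes are exercised later in this patch through configure
flags; a minimal build sketch using only the flags that appear in the updated
machine-tux.sh (per src/CMakeLists.txt, single precision uses float for
HYPRE_Real, and longdouble uses long double):

   cd src
   ./configure --enable-single --enable-debug       # HYPRE_Real as float
   make -j
   # or, for extended precision:
   ./configure --enable-longdouble --enable-debug   # HYPRE_Real as long double
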
diff --git a/docs/HYPRE_ref_manual.pdf b/docs/HYPRE_ref_manual.pdf
index 78de12f..774bb97 100644
Binary files a/docs/HYPRE_ref_manual.pdf and b/docs/HYPRE_ref_manual.pdf differ
diff --git a/docs/HYPRE_usr_manual.pdf b/docs/HYPRE_usr_manual.pdf
index a21b49e..c136630 100644
Binary files a/docs/HYPRE_usr_manual.pdf and b/docs/HYPRE_usr_manual.pdf differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 782dd52..c693ce0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,8 +2,8 @@ cmake_minimum_required (VERSION 2.8.8)
 project (hypre)
 
 # The version number.
-set (HYPRE_VERSION 2.11.2)
-set (HYPRE_DATE    2017/03/13)
+set (HYPRE_VERSION 2.12.1)
+set (HYPRE_DATE    2017/09/29)
 set (HYPRE_TIME    00:00:00)
 set (HYPRE_BUGS    hypre-support at llnl.gov)
 set (HYPRE_SRCDIR  "${PROJECT_SOURCE_DIR}")
@@ -25,6 +25,8 @@ set (CMAKE_BUILD_TYPE "${HYPRE_BUILD_TYPE}" CACHE INTERNAL "" FORCE)
 # Configuration options
 option(HYPRE_SHARED               "Build a shared library" OFF)
 option(HYPRE_BIGINT               "Use long long int for HYPRE_Int" OFF)
+option(HYPRE_SINGLE               "Use float for HYPRE_Real" OFF)
+option(HYPRE_LONG_DOUBLE          "Use long double for HYPRE_Real" OFF)
 option(HYPRE_SEQUENTIAL           "Compile without MPI" OFF)
 option(HYPRE_TIMING               "Use HYPRE timing routines" OFF)
 option(HYPRE_USING_HYPRE_BLAS     "Use internal BLAS library" ON)
@@ -44,6 +46,14 @@ if (HYPRE_BIGINT)
   set (HYPRE_USING_FEI OFF CACHE BOOL "" FORCE)
 endif ()
 
+if (HYPRE_SINGLE)
+  set (HYPRE_USING_FEI OFF CACHE BOOL "" FORCE)
+endif ()
+
+if (HYPRE_LONG_DOUBLE)
+  set (HYPRE_USING_FEI OFF CACHE BOOL "" FORCE)
+endif ()
+
 if (HYPRE_SEQUENTIAL)
   set (HYPRE_NO_GLOBAL_PARTITION OFF CACHE BOOL "" FORCE)
 endif ()
@@ -470,6 +480,7 @@ list (APPEND HYPRE_SOURCES
   parcsr_ls/par_cg_relax_wt.c
   parcsr_ls/par_coarsen.c
   parcsr_ls/par_cgc_coarsen.c
+  parcsr_ls/par_cheby.c
   parcsr_ls/par_coarse_parms.c
   parcsr_ls/par_coordinates.c
   parcsr_ls/par_cr.c
@@ -491,6 +502,7 @@ list (APPEND HYPRE_SOURCES
   parcsr_ls/par_rap_communication.c
   parcsr_ls/par_rotate_7pt.c
   parcsr_ls/par_vardifconv.c
+  parcsr_ls/par_vardifconv_rs.c
   parcsr_ls/par_relax.c
   parcsr_ls/par_relax_more.c
   parcsr_ls/par_relax_interface.c
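
The same precision choices are exposed to CMake builds through the two new
options above; a minimal out-of-source invocation might look like this
(the build-directory layout is an assumption):

    mkdir build && cd build
    cmake -DHYPRE_SINGLE=ON ../src    # float for HYPRE_Real
    # or: cmake -DHYPRE_LONG_DOUBLE=ON ../src
    make

Note that either option forces HYPRE_USING_FEI off, per the two guard
blocks above.
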
diff --git a/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx b/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx
index 97e186a..da3f565 100644
--- a/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx
+++ b/src/FEI_mv/fei-hypre/HYPRE_LSC_aux.cxx
@@ -4615,11 +4615,12 @@ void HYPRE_LinSysCore::solveUsingBoomeramg(int& status)
 
 double HYPRE_LinSysCore::solveUsingSuperLU(int& status)
 {
+  double             rnorm=-1.0;
 #ifdef HAVE_SUPERLU
    int                i, nnz, nrows, ierr;
    int                rowSize, *colInd, *new_ia, *new_ja, *ind_array;
    int                nz_ptr, *partition, start_row, end_row;
-   double             *colVal, *new_a, rnorm=-1.0;
+   double             *colVal, *new_a;
    HYPRE_ParCSRMatrix A_csr;
    HYPRE_ParVector    r_csr;
    HYPRE_ParVector    b_csr;
@@ -4793,12 +4794,13 @@ double HYPRE_LinSysCore::solveUsingSuperLU(int& status)
 
 double HYPRE_LinSysCore::solveUsingSuperLUX(int& status)
 {
+   double             rnorm=-1.0;
 #ifdef HAVE_SUPERLU
    int                i, nnz, nrows, ierr;
    int                rowSize, *colInd, *new_ia, *new_ja, *ind_array;
    int                nz_ptr;
    int                *partition, start_row, end_row;
-   double             *colVal, *new_a, rnorm=-1.0;
+   double             *colVal, *new_a;
    HYPRE_ParCSRMatrix A_csr;
    HYPRE_ParVector    r_csr;
    HYPRE_ParVector    b_csr;
diff --git a/src/FEI_mv/fei-hypre/Makefile b/src/FEI_mv/fei-hypre/Makefile
index 5a82ce0..9ca920d 100644
--- a/src/FEI_mv/fei-hypre/Makefile
+++ b/src/FEI_mv/fei-hypre/Makefile
@@ -20,8 +20,8 @@ SUPERLU_LIB     = -L$(srcdir)/../SuperLU
 
 CINCLUDES=${INCLUDES} ${MPIINCLUDE} -I../ml/src/Include
 CXXINCLUDES=${INCLUDES} ${MPIINCLUDE} -I../ml/src/Include
-CDEFS = -DHAVE_SUPERLU -DBOOL_NOT_SUPPORTED -DHAVE_MLI
-CXXDEFS = -DHAVE_SUPERLU -DBOOL_NOT_SUPPORTED -DMPICH_SKIP_MPICXX -DHAVE_MLI
+CDEFS = -DBOOL_NOT_SUPPORTED ${HYPRE_FEI_CDEFS}
+CXXDEFS = ${CDEFS} -DMPICH_SKIP_MPICXX
 
 C_COMPILE_FLAGS = \
  ${CDEFS}\
diff --git a/src/IJ_mv/IJVector_parcsr.c b/src/IJ_mv/IJVector_parcsr.c
index a895e05..367bb24 100644
--- a/src/IJ_mv/IJVector_parcsr.c
+++ b/src/IJ_mv/IJVector_parcsr.c
@@ -858,7 +858,7 @@ hypre_IJVectorAssembleOffProcValsPar( hypre_IJVector *vector,
    			     	      HYPRE_Complex  *off_proc_data)
 {
    MPI_Comm comm = hypre_IJVectorComm(vector);
-   hypre_ParVector *par_vector = hypre_IJVectorObject(vector);
+   hypre_ParVector *par_vector = ( hypre_ParVector *) hypre_IJVectorObject(vector);
    hypre_MPI_Request *requests = NULL;
    hypre_MPI_Status *status = NULL;
    HYPRE_Int i, j, j2, row;
@@ -1315,7 +1315,8 @@ hypre_IJVectorAssembleOffProcValsPar( hypre_IJVector *vector,
       ex_contact_vec_starts[i+1] = -storage-1; /* need negative for next loop */
    }      
 
-   void_contact_buf = hypre_MAlloc(storage*obj_size_bytes);
+   /*void_contact_buf = hypre_MAlloc(storage*obj_size_bytes);*/
+   void_contact_buf = hypre_CAlloc(storage,obj_size_bytes);
    index_ptr = void_contact_buf; /* step through with this index */
 
    /* set up data to be sent to send procs */
diff --git a/src/Makefile b/src/Makefile
index cba466f..48d998e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -10,6 +10,7 @@
 # $Revision$
 #EHEADER**********************************************************************
 
+default: all
 
 # Include all variables defined by configure
 include config/Makefile.config
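
The new 'default: all' line sits before the include, so it becomes the
first (and therefore default) target: a bare make now builds 'all' rather
than whichever rule the included config/Makefile.config happens to define
first. For example:

    cd src && ./configure && make    # equivalent to 'make all'
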
diff --git a/src/blas/hypre_blas.h b/src/blas/hypre_blas.h
index baa3791..adffb04 100644
--- a/src/blas/hypre_blas.h
+++ b/src/blas/hypre_blas.h
@@ -50,8 +50,8 @@ extern "C" {
 #define s_copy   hypre_F90_NAME_BLAS(s_copy,S_COPY)
 
 /* blas_utils.c */
-logical lsame_ ( const char *ca ,const char *cb );
-HYPRE_Int xerbla_ ( const char *srname , integer *info );
+logical hypre_lsame_ ( const char *ca ,const char *cb );
+HYPRE_Int hypre_xerbla_ ( const char *srname , integer *info );
 integer s_cmp ( char *a0 , const char *b0 , ftnlen la , ftnlen lb );
 VOID s_copy ( char *a , const char *b , ftnlen la , ftnlen lb );
 
diff --git a/src/config/HYPRE_config.h.cmake.in b/src/config/HYPRE_config.h.cmake.in
index 3985e80..3c6020c 100644
--- a/src/config/HYPRE_config.h.cmake.in
+++ b/src/config/HYPRE_config.h.cmake.in
@@ -19,6 +19,12 @@
 /* Use long long int for HYPRE_Int */
 #cmakedefine HYPRE_BIGINT
 
+/* Use single precision values for HYPRE_Real */
+#cmakedefine HYPRE_SINGLE
+
+/* Use quad precision values for HYPRE_Real */
+#cmakedefine HYPRE_LONG_DOUBLE
+
 /* Use complex values */
 #cmakedefine HYPRE_COMPLEX
 
diff --git a/src/config/HYPRE_config.h.in b/src/config/HYPRE_config.h.in
index a7df6ef..f60473d 100644
--- a/src/config/HYPRE_config.h.in
+++ b/src/config/HYPRE_config.h.in
@@ -49,6 +49,12 @@
 /* Define to 1 if using long long int for HYPRE_Int */
 #undef HYPRE_BIGINT
 
+/* Define to 1 if using single precision values for HYPRE_Real */
+#undef HYPRE_SINGLE
+
+/* Define to 1 if using quad precision values for HYPRE_Real */
+#undef HYPRE_LONG_DOUBLE
+
 /* Define to 1 if using complex values */
 #undef HYPRE_COMPLEX
 
@@ -115,5 +121,8 @@
 /* As HYPRE_FC_FUNC, but for C identifiers containing underscores. */
 #undef FC_FUNC_
 
+/* Define to 1 if nvcc is enabled */
+#undef HYPRE_USING_NVCC
+
 /* Define to 1 if Caliper instrumentation is enabled */
 #undef HYPRE_USING_CALIPER
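
Each of these configure choices lands in the generated header as a plain
define, so a build can be sanity-checked directly (the header path assumes
the default layout, where configure writes HYPRE_config.h under src/):

    cd src && ./configure --enable-single
    grep HYPRE_SINGLE HYPRE_config.h    # expect: #define HYPRE_SINGLE 1
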
diff --git a/src/config/Makefile.config.in b/src/config/Makefile.config.in
index 4ceb255..61a793c 100644
--- a/src/config/Makefile.config.in
+++ b/src/config/Makefile.config.in
@@ -42,7 +42,7 @@ HYPRE_INC_INSTALL = @HYPRE_INCINSTALL@
 HYPRE_LIB_SUFFIX = @HYPRE_LIBSUFFIX@
 
 .SUFFIXES:
-.SUFFIXES: .o .f .c .C .cxx .cc
+.SUFFIXES: .o .f .c .C .cxx .cc .cu
 
 .f.o:
 	$(FC) $(FFLAGS) -c $< 
@@ -54,6 +54,8 @@ HYPRE_LIB_SUFFIX = @HYPRE_LIBSUFFIX@
 	$(CXX) $(CXXFLAGS) -c $<
 .cc.o:
 	$(CXX) $(CXXFLAGS) -c $<
+.cu.o:
+	$(NVCC) $(NVCCFLAGS) -c $<
 
 FC       = @FC@
 FFLAGS   = @FFLAGS@ @FCFLAGS@ $(FC_COMPILE_FLAGS)
@@ -64,6 +66,10 @@ CFLAGS   = @CFLAGS@ @DEFS@ $(C_COMPILE_FLAGS)
 CXX      = @CXX@
 CXXFLAGS = @CXXFLAGS@ @DEFS@ $(CXX_COMPILE_FLAGS)
 
+NVCC 	  = nvcc
+NVCCFLAGS = @NVCCFLAGS@
+NVCCLIBS  = @NVCCLIBS@
+
 LINK_FC  = @LINK_FC@
 LINK_CC  = @LINK_CC@
 LINK_CXX = @LINK_CXX@
@@ -85,10 +91,10 @@ AR     = @AR@
 RANLIB = @RANLIB@
 
 LDFLAGS = @LDFLAGS@
-LIBS    = @LIBS@ @CALIPER_LIBS@
+LIBS    = @LIBS@ @CALIPER_LIBS@ @NVCCLIBS@  @RAJA_LIBS@ @KOKKOS_LIBS@
 FLIBS   = @FLIBS@
 
-INCLUDES = @CALIPER_INCLUDE@
+INCLUDES = @CALIPER_INCLUDE@ @HYPRE_RAJA_INCLUDE@ @HYPRE_KOKKOS_INCLUDE@
 
 ##################################################################
 ##  LAPACK Library Flags 
@@ -115,6 +121,11 @@ MPILIBS    = @MPILIBS@
 MPIFLAGS   = @MPIFLAGS@
 
 ##################################################################
+##  NVCC options
+##################################################################
+HYPRE_NVCC_MAKEFILE = @HYPRE_NVCC_MAKEFILE@
+
+##################################################################
 ##  Caliper options
 ##################################################################
 CALIPER_INCLUDE = @CALIPER_INCLUDE@
@@ -129,3 +140,20 @@ HYPRE_FEI_SUBDIRS       = @HYPRE_FEI_SUBDIRS@
 HYPRE_FEI_SUPERLU_FILES = @HYPRE_FEI_SUPERLU_FILES@
 HYPRE_FEI_HYPRE_FILES   = @HYPRE_FEI_HYPRE_FILES@
 HYPRE_FEI_FEMLI_FILES   = @HYPRE_FEI_FEMLI_FILES@
+HYPRE_FEI_CDEFS         = @HYPRE_FEI_CDEFS@
+
+##################################################################
+##  RAJA options
+##################################################################
+HYPRE_RAJA_LIB_DIR       = @HYPRE_RAJA_LIB_DIR@
+HYPRE_RAJA_INCLUDE       = @HYPRE_RAJA_INCLUDE@
+HYPRE_RAJA_LIB           = @HYPRE_RAJA_LIB@
+
+##################################################################
+##  Kokkos options
+##################################################################
+HYPRE_KOKKOS_SRC_DIR = @HYPRE_KOKKOS_SRC_DIR@
+HYPRE_KOKKOS_LIB_DIR = @HYPRE_KOKKOS_LIB_DIR@
+HYPRE_KOKKOS_INCLUDE = @HYPRE_KOKKOS_INCLUDE@
+HYPRE_KOKKOS_LIB     = @HYPRE_KOKKOS_LIB@
+@HYPRE_KOKKOS_INC_FILE@
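
With the .cu.o suffix rule and the NVCC variables above in place, CUDA
sources flow through make like any other translation unit; for a
hypothetical foo.cu the rule expands to roughly (NVCCFLAGS as substituted
by configure):

    nvcc $NVCCFLAGS -c foo.cu
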
diff --git a/src/config/config.guess b/src/config/config.guess
index 2e9ad7f..69ebd09 100755
--- a/src/config/config.guess
+++ b/src/config/config.guess
@@ -2,7 +2,7 @@
 # Attempt to guess a canonical system name.
 #   Copyright 1992-2016 Free Software Foundation, Inc.
 
-timestamp='2016-10-02'
+timestamp='2017-03-02'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@@ -1009,7 +1009,24 @@ EOF
     or32:Linux:*:* | or1k*:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
-    padre:Linux:*:*)
+    ppc64le:Linux:*:*)
+	echo powerpc64-unknown-linux-gnu
+	exit ;;
+    alpha:Linux:*:*)
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+        esac
+	objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
+	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+	exit ;;
+    padre:Linux:*:*)
 	echo sparc-unknown-linux-${LIBC}
 	exit ;;
     parisc64:Linux:*:* | hppa64:Linux:*:*)
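
The refreshed config.guess matters for the new GPU targets: ppc64le hosts
(the POWER machines the CUDA flags elsewhere in this release are tuned for)
are now recognized directly, reporting the triplet echoed above:

    $ ./src/config/config.guess
    powerpc64-unknown-linux-gnu
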
diff --git a/src/config/configure.in b/src/config/configure.in
index 0d1e280..078c268 100644
--- a/src/config/configure.in
+++ b/src/config/configure.in
@@ -55,8 +55,8 @@ dnl * Set package information so it only has to be modified in one place
 dnl *********************************************************************
 
 m4_define([M4_HYPRE_NAME],    [hypre])
-m4_define([M4_HYPRE_VERSION], [2.11.2])
-m4_define([M4_HYPRE_DATE],    [2017/03/13])
+m4_define([M4_HYPRE_VERSION], [2.12.1])
+m4_define([M4_HYPRE_DATE],    [2017/09/29])
 m4_define([M4_HYPRE_TIME],    [00:00:00])
 m4_define([M4_HYPRE_BUGS],    [hypre-support at llnl.gov])
 m4_define([M4_HYPRE_SRCDIR],  [`pwd`])
@@ -133,6 +133,9 @@ hypre_user_chose_mpi=no
 hypre_user_chose_blas=no
 hypre_user_chose_lapack=no
 hypre_user_chose_fei=no
+hypre_user_chose_cuda=no
+hypre_user_chose_raja=no
+hypre_user_chose_kokkos=no
 
 hypre_using_c=yes
 hypre_using_cxx=yes
@@ -145,6 +148,9 @@ hypre_using_mli=yes
 
 hypre_using_openmp=no
 hypre_using_insure=no
+hypre_using_cuda=no
+hypre_using_gpu=no
+hypre_using_um=no
 
 hypre_using_caliper=no
 hypre_user_gave_caliper_lib=no
@@ -217,6 +223,38 @@ then
    AC_DEFINE(HYPRE_BIGINT, 1)
 fi
 
+AC_ARG_ENABLE(single,
+AS_HELP_STRING([--enable-single],
+               [Use single precision values (default is NO).]),
+[case "${enableval}" in
+    yes) hypre_using_fei=no
+         hypre_using_single=yes ;;
+    no)  hypre_using_single=no ;;
+    *)   AC_MSG_ERROR([Bad value ${enableval} for --enable-single]) ;;
+ esac],
+[hypre_using_single=no]
+)
+if test "$hypre_using_single" = "yes"
+then
+   AC_DEFINE(HYPRE_SINGLE, 1)
+fi
+
+AC_ARG_ENABLE(longdouble,
+AS_HELP_STRING([--enable-longdouble],
+               [Use long double precision values (default is NO).]),
+[case "${enableval}" in
+    yes) hypre_using_fei=no
+         hypre_using_longdouble=yes ;;
+    no)  hypre_using_longdouble=no ;;
+    *)   AC_MSG_ERROR([Bad value ${enableval} for --enable-longdouble]) ;;
+ esac],
+[hypre_using_longdouble=no]
+)
+if test "$hypre_using_longdouble" = "yes"
+then
+   AC_DEFINE(HYPRE_LONG_DOUBLE, 1)
+fi
+
 AC_ARG_ENABLE(complex,
 AS_HELP_STRING([--enable-complex],
                [Use complex values (default is NO).]),
@@ -293,8 +331,6 @@ AS_HELP_STRING([--enable-global-partition],
     *)   hypre_using_global_partition=yes ;;
  esac]
 )
-dnl * The AC_DEFINE is below, after hypre_using_mpi is completely set
-dnl * Need to change to a new approach that always defines variable to some value
 
 AC_ARG_ENABLE(fortran,
 AS_HELP_STRING([--enable-fortran],
@@ -307,6 +343,20 @@ AS_HELP_STRING([--enable-fortran],
 [hypre_using_fortran=yes]
 )
 
+AC_ARG_ENABLE(unified-memory,
+AS_HELP_STRING([--enable-unified-memory],
+               [Use unified memory for memory allocation (default is NO).]),
+[case "${enableval}" in
+    yes) hypre_using_um=yes ;;
+    no)  hypre_using_um=no ;;
+    *)   hypre_using_um=no ;;
+ esac],
+[hypre_using_um=no]
+)
+
+dnl * The AC_DEFINE is below, after hypre_using_mpi is completely set
+dnl * Need to change to a new approach that always defines variable to some value
+
 dnl *********************************************************************
 dnl * Determine if user provided C compiler or flags
 dnl *********************************************************************
@@ -725,8 +775,7 @@ AC_ARG_WITH(openmp,
 AS_HELP_STRING([--with-openmp],
                [Use OpenMP.  This may affect which compiler is chosen.]),
 [case "${withval}" in
-    yes) hypre_using_openmp=yes
-         AC_DEFINE([HYPRE_USING_OPENMP],1,[Enable OpenMP support]) ;;
+    yes) hypre_using_openmp=yes;;
     no)  hypre_using_openmp=no ;;
  esac],
 [hypre_using_openmp=no]
@@ -769,6 +818,139 @@ AS_HELP_STRING([--with-MPI],
  esac]
 )
 
+AC_ARG_WITH(cuda,
+AS_HELP_STRING([--with-cuda],
+               [Use CUDA. Requires cuda-8.0 or higher (default is NO).]),
+[case "$withval" in
+    yes) hypre_user_chose_cuda=yes
+    	 hypre_using_cuda=yes ;;
+    no)  hypre_using_cuda=no ;;
+    *)   hypre_using_cuda=no ;;
+ esac],
+[hypre_using_cuda=no]
+)
+
+AC_ARG_WITH(raja,
+AS_HELP_STRING([--with-raja],
+               [Use RAJA. Requires the RAJA package to be compiled properly (default is NO).]),
+[case "$withval" in
+    yes) hypre_user_chose_raja=yes;;
+    no)  hypre_user_chose_raja=no ;;
+    *)   hypre_user_chose_raja=no ;;
+ esac],
+[hypre_using_raja=no]
+)
+
+AC_ARG_WITH(kokkos,
+AS_HELP_STRING([--with-kokkos],
+               [Use Kokkos. Requires the Kokkos package to be compiled properly (default is NO).]),
+[case "$withval" in
+    yes) hypre_user_chose_kokkos=yes ;;
+    no)  hypre_user_chose_kokkos=no ;;
+    *)   hypre_user_chose_kokkos=no ;;
+ esac]
+)
+
+AC_ARG_WITH(raja-include,
+AS_HELP_STRING([--with-raja-include=DIR],
+               [User specifies that RAJA/*.h is in DIR.  The options
+                --with-raja-include, --with-raja-libs, and
+                --with-raja-lib-dirs must be used together.]),
+[for raja_dir in $withval; do
+    HYPRE_RAJA_INCLUDE="$HYPRE_RAJA_INCLUDE -I$raja_dir"
+ done;
+ hypre_user_chose_raja=yes]
+)
+
+AC_ARG_WITH(raja-lib,
+AS_HELP_STRING([--with-raja-lib=LIBS],
+               [LIBS is a space-separated linkable list (enclosed in quotes) of libraries
+                needed for RAJA. OK to use -L and -l flags in the list.]),
+[for raja_lib in $withval; do
+       HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB $raja_lib"
+ done;
+hypre_user_chose_raja=yes]
+)
+
+AC_ARG_WITH(raja-libs,
+AS_HELP_STRING([--with-raja-libs=LIBS],
+               [LIBS is a space-separated list (enclosed in quotes) of libraries
+                needed for RAJA (base name only). The options --with-raja-libs and
+                --with-raja-lib-dirs must be used together.]),
+[for raja_lib in $withval; do
+    HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB -l$raja_lib"
+ done;
+hypre_user_chose_raja=yes]
+)
+
+AC_ARG_WITH(raja-lib-dirs,
+AS_HELP_STRING([--with-raja-lib-dirs=DIRS],
+               [DIRS is a space-separated list (enclosed in quotes) of
+                directories containing the libraries specified by
+                --with-raja-libs, e.g. "/usr/lib /usr/local/lib".
+                The options --with-raja-libs and --with-raja-lib-dirs
+                must be used together.]),
+[for raja_lib_dir in $withval; do
+    HYPRE_RAJA_LIB_DIR="-L$raja_lib_dir $HYPRE_RAJA_LIB_DIR"
+ done;
+ hypre_user_chose_raja=yes]
+)
+
+AC_ARG_WITH(kokkos-include,
+AS_HELP_STRING([--with-kokkos-include=DIR],
+               [User specifies that KOKKOS headers are in DIR.  The options
+                --with-kokkos-include, --with-kokkos-libs, and
+                --with-kokkos-dirs must be used together.]),
+[for kokkos_dir in $withval; do
+HYPRE_KOKKOS_INCLUDE="$HYPRE_KOKKOS_INCLUDE -I$kokkos_dir"
+done;
+hypre_user_chose_kokkos=yes]
+) 
+
+AC_ARG_WITH(kokkos-lib,
+AS_HELP_STRING([--with-kokkos-lib=LIBS],
+               [LIBS is a space-separated linkable list (enclosed in quotes) of libraries
+                needed for KOKKOS. OK to use -L and -l flags in the list.]),
+[for kokkos_lib in $withval; do
+       HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB $kokkos_lib"
+ done;
+hypre_user_chose_kokkos=yes]
+)
+
+AC_ARG_WITH(kokkos-libs,
+AS_HELP_STRING([--with-kokkos-libs=LIBS],
+               [LIBS is a space-separated list (enclosed in quotes) of libraries
+                needed for KOKKOS (base name only). The options --with-kokkos-libs and
+                --with-kokkos-dirs must be used together.]),
+[for kokkos_lib in $withval; do
+    HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB -l$kokkos_lib"
+ done;
+hypre_user_chose_kokkos=yes]
+)
+
+AC_ARG_WITH(kokkos-dirs,
+AS_HELP_STRING([--with-kokkos-dirs=DIRS],
+               [DIRS is a space-separated list (enclosed in quotes) of
+                directories containing Makefile.kokkos.
+                The options --with-kokkos-libs and --with-kokkos-dirs
+                must be used together.]),
+[for kokkos_lib_dir in $withval; do
+    HYPRE_KOKKOS_SRC_DIR="$kokkos_lib_dir"
+ done;
+hypre_user_chose_kokkos=yes]
+)
+
+AC_ARG_WITH(nvcc,
+AS_HELP_STRING([--with-nvcc],
+               [Use NVCC compiler (default is NO).]),
+[case "${withval}" in
+    yes) hypre_using_nvcc=yes ;;
+    no)  hypre_using_nvcc=no ;;
+    *)   AC_MSG_ERROR([Bad value ${withval} for --with-nvcc]) ;;
+ esac],
+[hypre_using_nvcc=no]
+)
+
 AC_ARG_WITH(caliper,
 AS_HELP_STRING([--with-caliper],
                [Use Caliper instrumentation (default is NO).]),
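
Putting the new switches together, typical accelerator configurations look
like the following sketches (install prefixes are hypothetical; as the help
strings above note, the grouped RAJA options must be given together):

    # CUDA with unified memory, so more than the struct interface runs on GPU
    ./configure --with-cuda --enable-unified-memory

    # RAJA against a local installation
    ./configure --with-raja \
        --with-raja-include=/opt/raja/include \
        --with-raja-libs=RAJA \
        --with-raja-lib-dirs=/opt/raja/lib
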
@@ -801,22 +983,48 @@ AS_HELP_STRING([--with-caliper-lib=LIBS],
 dnl *********************************************************************
 dnl * Select compilers if not already defined by command line options 
 dnl *********************************************************************
+if test "$hypre_using_cuda" = "yes"
+then
+	hypre_using_fortran=no
+	AC_CHECK_PROGS(CXX, [nvcc])
+	AC_CHECK_PROGS(CC, [nvcc])
+	if test "$hypre_user_chose_cxxcompilers" = "no"
+	then
+	   if test "$hypre_using_mpi" = "no"
+	   then
+	      if test "$hypre_using_openmp" = "yes"
+	      then
+	         AC_CHECK_PROGS(CUDACXX, [xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc])
+              else
+		 AC_CHECK_PROGS(CUDACXX, [xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc])
+	      fi
+	   else
+	      if test "$hypre_using_openmp" = "yes"
+	      then
+	         AC_CHECK_PROGS(CUDACXX, [mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC])
+	      else
+	         AC_CHECK_PROGS(CUDACXX, [mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC])
+	      fi
+	   fi
+	fi
+fi
+
 if test "$hypre_user_chose_ccompilers" = "no"
 then
    if test "$hypre_using_mpi" = "no"
    then
       if test "$hypre_using_openmp" = "yes"
       then
-         AC_CHECK_PROGS(CC, [xlc_r xlC_r icc icpc gcc g++ pgcc pgCC cc CC kcc KCC])
+         AC_CHECK_PROGS(CC, [xlc_r xlC_r xlc xlC icc icpc gcc g++ pgcc pgCC cc CC kcc KCC])
       else
          AC_CHECK_PROGS(CC, [xlc xlC icc icpc gcc g++ pgcc pgCC cc CC kcc KCC])
       fi
    else
       if test "$hypre_using_openmp" = "yes"
       then
-         AC_CHECK_PROGS(CC, [mpxlc mpixlc_r mpiicc mpicc mpipgcc])
+         AC_CHECK_PROGS(CC, [mpxlc mpixlc_r mpixlc mpiicc mpigcc mpicc mpipgcc])
       else
-         AC_CHECK_PROGS(CC, [mpxlc mpixlc mpiicc mpicc mpipgcc])
+         AC_CHECK_PROGS(CC, [mpxlc mpixlc mpiicc mpigcc mpicc mpipgcc])
       fi
    fi
 
@@ -832,16 +1040,16 @@ then
    then
       if test "$hypre_using_openmp" = "yes"
       then
-         AC_CHECK_PROGS(CXX, [xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc])
+         AC_CHECK_PROGS(CXX, [xlC_r xlc_r xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc])
       else
          AC_CHECK_PROGS(CXX, [xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc])
       fi
    else
       if test "$hypre_using_openmp" = "yes"
       then
-         AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx_r mpiicpc mpiCC mpicxx mpipgCC])
+         AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC])
       else
-         AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx mpiicpc mpiCC mpicxx mpipgCC])
+         AC_CHECK_PROGS(CXX, [mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC])
       fi
    fi
 
@@ -1046,10 +1254,12 @@ then
    HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv"
    HYPRE_FEI_SUBDIRS="fei-hypre"
    HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o"
+   HYPRE_FEI_CDEFS=
    if test "$hypre_using_superlu" = "yes"
    then
       HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS"
       HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o"
+      HYPRE_FEI_CDEFS="$HYPRE_FEI_CDEFS -DHAVE_SUPERLU"
    else
       HYPRE_FEI_SUPERLU_FILES=
    fi
@@ -1057,6 +1267,7 @@ then
    then
       HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS"
       HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o"
+      HYPRE_FEI_CDEFS="$HYPRE_FEI_CDEFS -DHAVE_MLI"
    else
       HYPRE_FEI_FEMLI_FILES=
    fi
@@ -1071,6 +1282,7 @@ else
    HYPRE_FEI_HYPRE_FILES=
    HYPRE_FEI_FEMLI_FILES=
    HYPRE_FEI_SUPERLU_FILES=
+   HYPRE_FEI_CDEFS=
 fi
 
 dnl *********************************************************************
@@ -1254,6 +1466,172 @@ then
 fi
 
 dnl *********************************************************************
+dnl * Set nvcc options
+dnl *********************************************************************
+
+if test "$hypre_using_nvcc" = "yes"
+then
+   AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler])
+   NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+   NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+   HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+   CFLAGS="${CFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED "
+   CXXFLAGS="${CXXFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED "
+else
+   NVCCFLAGS=
+   NVCCLIBS=
+   HYPRE_NVCC_MAKEFILE="Makefile.empty"
+fi
+
+dnl *********************************************************************
+dnl * Set raja options
+dnl *********************************************************************
+if test "$hypre_user_chose_raja" = "yes"
+then
+   RAJA_LIBS=" $HYPRE_RAJA_LIB_DIR $HYPRE_RAJA_LIB "
+   if [test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r"]
+   then 
+   	CFLAGS+=" -+ "
+   fi
+   if test "$hypre_using_cuda" = "yes"
+      then
+        RAJAFLAGS=" -lRAJA "
+	LDFLAGS=" -ccbin=$CUDACXX -expt-extended-lambda -Xcompiler -fopenmp -arch compute_35 -lcudart -lcuda $RAJAFLAGS "
+   	CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register -Xcompiler $RAJAFLAGS "
+   	CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register  $RAJAFLAGS "
+   	if test "$hypre_using_um" != "yes"
+   	then
+		CFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+	   	CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+		AC_MSG_NOTICE([*******************************************************])
+       		AC_MSG_NOTICE([Configuring with --with-raja and --with-cuda without unified memory.]) 
+		AC_MSG_NOTICE([This only works for the struct interface.])
+		AC_MSG_NOTICE([Try configuring with --with-raja --with-cuda --enable-unified-memory])
+		AC_MSG_NOTICE([to use the CUDA features for the whole package.])
+       		AC_MSG_NOTICE([*******************************************************])
+	else
+		CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+		CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU " 
+   	fi
+	hypre_user_chose_cuda=no
+   else 
+   	if test "$hypre_using_openmp" = "yes"
+	then
+	   CC=${CXX}
+	   CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP "
+	   CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP "
+	   hypre_using_openmp=no
+	else
+	   CC=${CXX}
+   	   CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA "
+   	   CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA "
+	fi
+   fi
+fi
+dnl *********************************************************************
+dnl * Set kokkos options
+dnl *********************************************************************
+
+if test "$hypre_user_chose_kokkos" = "yes"
+then
+   if [test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r"]
+   then 
+   	CFLAGS+=" -+ "
+   fi
+   if test "$hypre_using_cuda" = "yes"
+   then
+      LDFLAGS=" -ccbin=$CUDACXX -arch compute_35 -lcudart -lcuda"
+      HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+      HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+      CC=${CXX}
+      LINK_CC=$LINK_CXX
+      CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS "
+      CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS "
+      if test "$hypre_using_um" != "yes"
+      then
+          CFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+	  CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+          AC_MSG_NOTICE([*******************************************************])
+	  AC_MSG_NOTICE([Configuring with --with-kokkos and --with-cuda, but not with unified memory])
+	  AC_MSG_NOTICE([This only works for the struct interface.])
+	  AC_MSG_NOTICE([Try configuring with --with-kokkos --with-cuda --enable-unified-memory])
+	  AC_MSG_NOTICE([to use the CUDA features for the whole package.])
+	  AC_MSG_NOTICE([*******************************************************])
+      else
+	  CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	  CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	  LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	  AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler])
+	  NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+	  NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	  HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+      fi
+      hypre_user_chose_cuda=no
+   else
+      if test "$hypre_using_openmp" = "yes"  
+      then
+         CC=${CXX}
+	 CFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP"
+	 CXXFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP"
+	 HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+	 HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+	 hypre_using_openmp=no
+      else
+      	 CC=${CXX}
+	 CFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS "
+	 CXXFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS "
+	 HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+	 HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+	 LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	 AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler])
+	 NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+	 NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	 HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+      fi   
+  fi
+  KOKKOS_LIBS=" $HYPRE_KOKKOS_LIB_DIR $HYPRE_KOKKOS_LIB "
+fi
+dnl *********************************************************************
+dnl * Set cuda options
+dnl *********************************************************************
+if test "$hypre_user_chose_cuda" = "yes"
+then
+   LDFLAGS+=" -ccbin=$CUDACXX -arch compute_35 "
+   CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA "
+   CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA "
+   if test "$hypre_using_um" != "yes"
+   then
+	CFLAGS+=" -DHYPRE_MEMORY_GPU"
+	CXXFLAGS+=" -DHYPRE_MEMORY_GPU"
+	AC_MSG_NOTICE([*******************************************************])
+       	AC_MSG_NOTICE([Configuring with --with-cuda=yes without unified memory.]) 
+	AC_MSG_NOTICE([This only works for the struct interface.])
+	AC_MSG_NOTICE([Use --enable-unified-memory to compile with unified memory.])
+       	AC_MSG_NOTICE([*******************************************************])
+   else
+	CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+   	CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	LDFLAGS+="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	AC_DEFINE(HYPRE_USING_NVCC, 1, [Using nvcc compiler])
+	NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+	NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+   fi
+fi
+
+if test "$hypre_using_um" = "yes"
+then
+   LDFLAGS+=" -lcudart -lcuda "
+   CFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include "
+   CXXFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include "
+fi
+
+if test "$hypre_using_openmp" = "yes"
+then
+   AC_DEFINE([HYPRE_USING_OPENMP],1,[Enable OpenMP support])
+fi
+
+dnl *********************************************************************
 dnl * Set installation directories
 dnl *********************************************************************
 HYPRE_INSTALLDIR="${prefix}"
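
The Kokkos branch above locates Makefile.kokkos through --with-kokkos-dirs
and derives the include file and -L path from that directory, so a Kokkos
build reduces to something like (the path is hypothetical):

    ./configure --with-kokkos \
        --with-kokkos-dirs=/opt/kokkos \
        --with-kokkos-libs=kokkos
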
@@ -1341,10 +1719,12 @@ AC_SUBST(HYPRE_FEI_SUBDIRS)
 AC_SUBST(HYPRE_FEI_HYPRE_FILES)
 AC_SUBST(HYPRE_FEI_FEMLI_FILES)
 AC_SUBST(HYPRE_FEI_SUPERLU_FILES)
+AC_SUBST(HYPRE_FEI_CDEFS)
 
 dnl *********************************************************************
 dnl * BLAS & LAPACK related information
 dnl *********************************************************************
+AC_SUBST(HYPRE_KOKKOS_PATH)
 AC_SUBST(HYPRE_BLAS_SRC_DIR)
 AC_SUBST(HYPRE_BLAS_FILES)
 AC_SUBST(BLASLIBDIRS)
@@ -1355,6 +1735,30 @@ AC_SUBST(LAPACKLIBDIRS)
 AC_SUBST(LAPACKLIBS)
 
 dnl *********************************************************************
+dnl * RAJA information
+dnl *********************************************************************
+AC_SUBST(HYPRE_RAJA_LIB_DIR)
+AC_SUBST(HYPRE_RAJA_INCLUDE)
+AC_SUBST(HYPRE_RAJA_LIB)
+AC_SUBST(RAJA_LIBS)
+
+dnl *********************************************************************
+dnl * KOKKOS information
+dnl *********************************************************************
+AC_SUBST(HYPRE_KOKKOS_SRC_DIR)
+AC_SUBST(HYPRE_KOKKOS_LIB_DIR)
+AC_SUBST(HYPRE_KOKKOS_INCLUDE)
+AC_SUBST(HYPRE_KOKKOS_INC_FILE)	
+AC_SUBST(HYPRE_KOKKOS_LIB)
+AC_SUBST(KOKKOS_LIBS)
+dnl *********************************************************************
+dnl * NVCC stuff
+dnl *********************************************************************
+AC_SUBST(NVCCFLAGS)
+AC_SUBST(NVCCLIBS)
+AC_SUBST(HYPRE_NVCC_MAKEFILE)
+
+dnl *********************************************************************
 dnl * Caliper instrumentation
 dnl *********************************************************************
 AC_SUBST(CALIPER_INCLUDE)
diff --git a/src/configure b/src/configure
index c38f578..d9d5784 100755
--- a/src/configure
+++ b/src/configure
@@ -1,7 +1,7 @@
 #! /bin/sh
 # From configure.in Id.
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for hypre 2.11.2.
+# Generated by GNU Autoconf 2.69 for hypre 2.12.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -586,8 +586,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='hypre'
 PACKAGE_TARNAME='hypre'
-PACKAGE_VERSION='2.11.2'
-PACKAGE_STRING='hypre 2.11.2'
+PACKAGE_VERSION='2.12.1'
+PACKAGE_STRING='hypre 2.12.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -598,6 +598,19 @@ LIBOBJS
 AR
 CALIPER_LIBS
 CALIPER_INCLUDE
+HYPRE_NVCC_MAKEFILE
+NVCCLIBS
+NVCCFLAGS
+KOKKOS_LIBS
+HYPRE_KOKKOS_LIB
+HYPRE_KOKKOS_INC_FILE
+HYPRE_KOKKOS_INCLUDE
+HYPRE_KOKKOS_LIB_DIR
+HYPRE_KOKKOS_SRC_DIR
+RAJA_LIBS
+HYPRE_RAJA_LIB
+HYPRE_RAJA_INCLUDE
+HYPRE_RAJA_LIB_DIR
 LAPACKLIBS
 LAPACKLIBDIRS
 HYPRE_LAPACK_FILES
@@ -606,6 +619,8 @@ BLASLIBS
 BLASLIBDIRS
 HYPRE_BLAS_FILES
 HYPRE_BLAS_SRC_DIR
+HYPRE_KOKKOS_PATH
+HYPRE_FEI_CDEFS
 HYPRE_FEI_SUPERLU_FILES
 HYPRE_FEI_FEMLI_FILES
 HYPRE_FEI_HYPRE_FILES
@@ -653,6 +668,7 @@ LDFLAGS
 CFLAGS
 RANLIB
 SET_MAKE
+CUDACXX
 FC
 CXX
 CC
@@ -714,6 +730,8 @@ enable_option_checking
 enable_debug
 enable_shared
 enable_bigint
+enable_single
+enable_longdouble
 enable_complex
 enable_maxdim
 enable_persistent
@@ -721,6 +739,7 @@ enable_hopscotch
 with_no_global_partition
 enable_global_partition
 enable_fortran
+enable_unified_memory
 with_LD
 with_LDFLAGS
 with_extra_incpath
@@ -748,6 +767,18 @@ with_fei
 with_superlu
 with_mli
 with_MPI
+with_cuda
+with_raja
+with_kokkos
+with_raja_include
+with_raja_lib
+with_raja_libs
+with_raja_lib_dirs
+with_kokkos_include
+with_kokkos_lib
+with_kokkos_libs
+with_kokkos_dirs
+with_nvcc
 with_caliper
 with_caliper_include
 with_caliper_lib
@@ -1308,7 +1339,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures hypre 2.11.2 to adapt to many kinds of systems.
+\`configure' configures hypre 2.12.1 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1373,7 +1404,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of hypre 2.11.2:";;
+     short | recursive ) echo "Configuration of hypre 2.12.1:";;
    esac
   cat <<\_ACEOF
 
@@ -1384,6 +1415,8 @@ Optional Features:
   --enable-debug          Set compiler flags for debugging.
   --enable-shared         Build shared libraries (default is NO).
   --enable-bigint         Use long long int for HYPRE_Int (default is NO).
+  --enable-single         Use single precision values (default is NO).
+  --enable-longdouble     Use long double precision values (default is NO).
   --enable-complex        Use complex values (default is NO).
   --enable-maxdim=MAXDIM  Change max dimension size to MAXDIM (default is 3).
                           Currently must be at least 3.
@@ -1393,6 +1426,8 @@ Optional Features:
   --enable-global-partition
                           Use global partitioning (default is NO).
   --enable-fortran        Require a working Fortran compiler (default is YES).
+  --enable-unified-memory Use unified memory for memory allocation (default
+                          is NO).
 
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
@@ -1481,6 +1516,44 @@ Optional Packages:
   --with-mli              Use MLI
   --with-MPI              DEFAULT: Compile with MPI. Selecting --without-MPI
                           may affect which compiler is chosen.
+  --with-cuda             Use CUDA. Requires cuda-8.0 or higher (default is
+                          NO).
+  --with-raja             Use RAJA. Requires the RAJA package to be compiled
+                          properly (default is NO).
+  --with-kokkos           Use Kokkos. Requires the Kokkos package to be
+                          compiled properly (default is NO).
+  --with-raja-include=DIR User specifies that RAJA/*.h is in DIR. The options
+                          --with-raja-include, --with-raja-libs, and
+                          --with-raja-lib-dirs must be used together.
+  --with-raja-lib=LIBS    LIBS is a space-separated linkable list (enclosed
+                          in quotes) of libraries needed for RAJA. OK to use
+                          -L and -l flags in the list.
+  --with-raja-libs=LIBS   LIBS is a space-separated list (enclosed in quotes)
+                          of libraries needed for RAJA (base name only). The
+                          options --with-raja-libs and --with-raja-lib-dirs
+                          must be used together.
+  --with-raja-lib-dirs=DIRS
+                          DIRS is a space-separated list (enclosed in quotes)
+                          of directories containing the libraries specified
+                          by --with-raja-libs, e.g. "/usr/lib /usr/local/lib".
+                          The options --with-raja-libs and
+                          --with-raja-lib-dirs must be used together.
+  --with-kokkos-include=DIR
+                          User specifies that KOKKOS headers are in DIR. The
+                          options --with-kokkos-include, --with-kokkos-libs,
+                          and --with-kokkos-dirs must be used together.
+  --with-kokkos-lib=LIBS  LIBS is a space-separated linkable list (enclosed
+                          in quotes) of libraries needed for KOKKOS. OK to
+                          use -L and -l flags in the list.
+  --with-kokkos-libs=LIBS LIBS is a space-separated list (enclosed in quotes)
+                          of libraries needed for KOKKOS (base name only).
+                          The options --with-kokkos-libs and
+                          --with-kokkos-dirs must be used together.
+  --with-kokkos-dirs=DIRS DIRS is a space-separated list (enclosed in quotes)
+                          of directories containing Makefile.kokkos. The
+                          options --with-kokkos-libs and --with-kokkos-dirs
+                          must be used together.
+  --with-nvcc             Use NVCC compiler (default is NO).
   --with-caliper          Use Caliper instrumentation (default is NO).
   --with-caliper-include=DIR
                           Directory where Caliper is installed.
@@ -1570,7 +1643,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-hypre configure 2.11.2
+hypre configure 2.12.1
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1867,7 +1940,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by hypre $as_me 2.11.2, which was
+It was created by hypre $as_me 2.12.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2253,8 +2326,8 @@ ac_configure="$SHELL $ac_aux_dir/configure"  # Please don't use this var.
 
 
 HYPRE_NAME="hypre"
-HYPRE_VERSION="2.11.2"
-HYPRE_DATE="2017/03/13"
+HYPRE_VERSION="2.12.1"
+HYPRE_DATE="2017/09/29"
 HYPRE_TIME="00:00:00"
 HYPRE_BUGS="hypre-support at llnl.gov"
 HYPRE_SRCDIR="`pwd`"
@@ -2306,6 +2379,9 @@ hypre_user_chose_mpi=no
 hypre_user_chose_blas=no
 hypre_user_chose_lapack=no
 hypre_user_chose_fei=no
+hypre_user_chose_cuda=no
+hypre_user_chose_raja=no
+hypre_user_chose_kokkos=no
 
 hypre_using_c=yes
 hypre_using_cxx=yes
@@ -2318,6 +2394,9 @@ hypre_using_mli=yes
 
 hypre_using_openmp=no
 hypre_using_insure=no
+hypre_using_cuda=no
+hypre_using_gpu=no
+hypre_using_um=no
 
 hypre_using_caliper=no
 hypre_user_gave_caliper_lib=no
@@ -2457,6 +2536,44 @@ then
 
 fi
 
+# Check whether --enable-single was given.
+if test "${enable_single+set}" = set; then :
+  enableval=$enable_single; case "${enableval}" in
+    yes) hypre_using_fei=no
+         hypre_using_single=yes ;;
+    no)  hypre_using_single=no ;;
+    *)   as_fn_error $? "Bad value ${enableval} for --enable-single" "$LINENO" 5 ;;
+ esac
+else
+  hypre_using_single=no
+
+fi
+
+if test "$hypre_using_single" = "yes"
+then
+   $as_echo "#define HYPRE_SINGLE 1" >>confdefs.h
+
+fi
+
+# Check whether --enable-longdouble was given.
+if test "${enable_longdouble+set}" = set; then :
+  enableval=$enable_longdouble; case "${enableval}" in
+    yes) hypre_using_fei=no
+         hypre_using_longdouble=yes ;;
+    no)  hypre_using_longdouble=no ;;
+    *)   as_fn_error $? "Bad value ${enableval} for --enable-longdouble" "$LINENO" 5 ;;
+ esac
+else
+  hypre_using_longdouble=no
+
+fi
+
+if test "$hypre_using_longdouble" = "yes"
+then
+   $as_echo "#define HYPRE_LONG_DOUBLE 1" >>confdefs.h
+
+fi
+
 # Check whether --enable-complex was given.
 if test "${enable_complex+set}" = set; then :
   enableval=$enable_complex; case "${enableval}" in
@@ -2562,6 +2679,20 @@ else
 fi
 
 
+# Check whether --enable-unified-memory was given.
+if test "${enable_unified_memory+set}" = set; then :
+  enableval=$enable_unified_memory; case "${enableval}" in
+    yes) hypre_using_um=yes ;;
+    no)  hypre_using_um=no ;;
+    *)   hypre_using_um=no ;;
+ esac
+else
+  hypre_using_um=no
+
+fi
+
+
+
 if test "x$CC" = "x"
 then
    hypre_user_chose_ccompilers=no
@@ -3092,10 +3223,7 @@ fi
 # Check whether --with-openmp was given.
 if test "${with_openmp+set}" = set; then :
   withval=$with_openmp; case "${withval}" in
-    yes) hypre_using_openmp=yes
-
-$as_echo "#define HYPRE_USING_OPENMP 1" >>confdefs.h
- ;;
+    yes) hypre_using_openmp=yes;;
     no)  hypre_using_openmp=no ;;
  esac
 else
@@ -3149,6 +3277,149 @@ fi
 
 
 
+# Check whether --with-cuda was given.
+if test "${with_cuda+set}" = set; then :
+  withval=$with_cuda; case "$withval" in
+    yes) hypre_user_chose_cuda=yes
+    	 hypre_using_cuda=yes ;;
+    no)  hypre_using_cuda=no ;;
+    *)   hypre_using_cuda=no ;;
+ esac
+else
+  hypre_using_cuda=no
+
+fi
+
+
+
+# Check whether --with-raja was given.
+if test "${with_raja+set}" = set; then :
+  withval=$with_raja; case "$withval" in
+    yes) hypre_user_chose_raja=yes;;
+    no)  hypre_user_chose_raja=no ;;
+    *)   hypre_user_chose_raja=no ;;
+ esac
+else
+  hypre_using_raja=no
+
+fi
+
+
+
+# Check whether --with-kokkos was given.
+if test "${with_kokkos+set}" = set; then :
+  withval=$with_kokkos; case "$withval" in
+    yes) hypre_user_chose_kokkos=yes ;;
+    no)  hypre_user_chose_kokkos=no ;;
+    *)   hypre_user_chose_kokkos=no ;;
+ esac
+
+fi
+
+
+
+# Check whether --with-raja-include was given.
+if test "${with_raja_include+set}" = set; then :
+  withval=$with_raja_include; for raja_dir in $withval; do
+    HYPRE_RAJA_INCLUDE="$HYPRE_RAJA_INCLUDE -I$raja_dir"
+ done;
+ hypre_user_chose_raja=yes
+
+fi
+
+
+
+# Check whether --with-raja-lib was given.
+if test "${with_raja_lib+set}" = set; then :
+  withval=$with_raja_lib; for raja_lib in $withval; do
+       HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB $raja_lib"
+ done;
+hypre_user_chose_raja=yes
+
+fi
+
+
+
+# Check whether --with-raja-libs was given.
+if test "${with_raja_libs+set}" = set; then :
+  withval=$with_raja_libs; for raja_lib in $withval; do
+    HYPRE_RAJA_LIB="$HYPRE_RAJA_LIB -l$raja_lib"
+ done;
+hypre_user_chose_raja=yes
+
+fi
+
+
+
+# Check whether --with-raja-lib-dirs was given.
+if test "${with_raja_lib_dirs+set}" = set; then :
+  withval=$with_raja_lib_dirs; for raja_lib_dir in $withval; do
+    HYPRE_RAJA_LIB_DIR="-L$raja_lib_dir $HYPRE_RAJA_LIB_DIR"
+ done;
+ hypre_user_chose_raja=yes
+
+fi
+
+
+
+# Check whether --with-kokkos-include was given.
+if test "${with_kokkos_include+set}" = set; then :
+  withval=$with_kokkos_include; for kokkos_dir in $withval; do
+HYPRE_KOKKOS_INCLUDE="$HYPRE_KOKKOS_INCLUDE -I$kokkos_dir"
+done;
+hypre_user_chose_kokkos=yes
+
+fi
+
+
+
+# Check whether --with-kokkos-lib was given.
+if test "${with_kokkos_lib+set}" = set; then :
+  withval=$with_kokkos_lib; for kokkos_lib in $withval; do
+       HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB $kokkos_lib"
+ done;
+hypre_user_chose_kokkos=yes
+
+fi
+
+
+
+# Check whether --with-kokkos-libs was given.
+if test "${with_kokkos_libs+set}" = set; then :
+  withval=$with_kokkos_libs; for kokkos_lib in $withval; do
+    HYPRE_KOKKOS_LIB="$HYPRE_KOKKOS_LIB -l$kokkos_lib"
+ done;
+hypre_user_chose_kokkos=yes
+
+fi
+
+
+
+# Check whether --with-kokkos-dirs was given.
+if test "${with_kokkos_dirs+set}" = set; then :
+  withval=$with_kokkos_dirs; for kokkos_lib_dir in $withval; do
+    HYPRE_KOKKOS_SRC_DIR="$kokkos_lib_dir"
+ done;
+hypre_user_chose_kokkos=yes
+
+fi
+
+
+
+# Check whether --with-nvcc was given.
+if test "${with_nvcc+set}" = set; then :
+  withval=$with_nvcc; case "${withval}" in
+    yes) hypre_using_nvcc=yes ;;
+    no)  hypre_using_nvcc=no ;;
+    *)   as_fn_error $? "Bad value ${withval} for --with-nvcc" "$LINENO" 5 ;;
+ esac
+else
+  hypre_using_nvcc=no
+
+fi
+
+
+
 # Check whether --with-caliper was given.
 if test "${with_caliper+set}" = set; then :
   withval=$with_caliper; hypre_using_caliper=yes
@@ -3185,13 +3456,285 @@ if test "${with_caliper_lib+set}" = set; then :
 fi
 
 
+if test "$hypre_using_cuda" = "yes"
+then
+	hypre_using_fortran=no
+	for ac_prog in nvcc
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CXX"; then
+  ac_cv_prog_CXX="$CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CXX=$ac_cv_prog_CXX
+if test -n "$CXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5
+$as_echo "$CXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CXX" && break
+done
+
+	for ac_prog in nvcc
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CC="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CC" && break
+done
+
+	if test "$hypre_user_chose_cxxcompilers" = "no"
+	then
+	   if test "$hypre_using_mpi" = "no"
+	   then
+	      if test "$hypre_using_openmp" = "yes"
+	      then
+	         for ac_prog in xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CUDACXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CUDACXX"; then
+  ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CUDACXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CUDACXX=$ac_cv_prog_CUDACXX
+if test -n "$CUDACXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5
+$as_echo "$CUDACXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CUDACXX" && break
+done
+
+              else
+		 for ac_prog in xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CUDACXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CUDACXX"; then
+  ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CUDACXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CUDACXX=$ac_cv_prog_CUDACXX
+if test -n "$CUDACXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5
+$as_echo "$CUDACXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CUDACXX" && break
+done
+
+	      fi
+	   else
+	      if test "$hypre_using_openmp" = "yes"
+	      then
+	         for ac_prog in mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CUDACXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CUDACXX"; then
+  ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CUDACXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CUDACXX=$ac_cv_prog_CUDACXX
+if test -n "$CUDACXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5
+$as_echo "$CUDACXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CUDACXX" && break
+done
+
+	      else
+	         for ac_prog in mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CUDACXX+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CUDACXX"; then
+  ac_cv_prog_CUDACXX="$CUDACXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CUDACXX="$ac_prog"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi
+fi
+CUDACXX=$ac_cv_prog_CUDACXX
+if test -n "$CUDACXX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUDACXX" >&5
+$as_echo "$CUDACXX" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  test -n "$CUDACXX" && break
+done
+
+	      fi
+	   fi
+	fi
+fi
+
 if test "$hypre_user_chose_ccompilers" = "no"
 then
    if test "$hypre_using_mpi" = "no"
    then
       if test "$hypre_using_openmp" = "yes"
       then
-         for ac_prog in xlc_r xlC_r icc icpc gcc g++ pgcc pgCC cc CC kcc KCC
+         for ac_prog in xlc_r xlC_r xlc xlC icc icpc gcc g++ pgcc pgCC cc CC kcc KCC
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -3280,7 +3823,7 @@ done
    else
       if test "$hypre_using_openmp" = "yes"
       then
-         for ac_prog in mpxlc mpixlc_r mpiicc mpicc mpipgcc
+         for ac_prog in mpxlc mpixlc_r mpixlc mpiicc mpigcc mpicc mpipgcc
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -3323,7 +3866,7 @@ fi
 done
 
       else
-         for ac_prog in mpxlc mpixlc mpiicc mpicc mpipgcc
+         for ac_prog in mpxlc mpixlc mpiicc mpigcc mpicc mpipgcc
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -3380,7 +3923,7 @@ then
    then
       if test "$hypre_using_openmp" = "yes"
       then
-         for ac_prog in xlC_r xlc_r icpc icc g++ gcc pgCC pgcc CC cc KCC kcc
+         for ac_prog in xlC_r xlc_r xlC xlc icpc icc g++ gcc pgCC pgcc CC cc KCC kcc
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -3469,7 +4012,7 @@ done
    else
       if test "$hypre_using_openmp" = "yes"
       then
-         for ac_prog in mpxlC mpixlcxx_r mpiicpc mpiCC mpicxx mpipgCC
+         for ac_prog in mpxlC mpixlcxx_r mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -3512,7 +4055,7 @@ fi
 done
 
       else
-         for ac_prog in mpxlC mpixlcxx mpiicpc mpiCC mpicxx mpipgCC
+         for ac_prog in mpxlC mpixlcxx mpixlC mpiicpc mpig++ mpiCC mpicxx mpipgCC
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -6731,10 +7274,12 @@ then
    HYPRE_FEI_SRC_DIR="$HYPRE_SRCDIR/FEI_mv"
    HYPRE_FEI_SUBDIRS="fei-hypre"
    HYPRE_FEI_HYPRE_FILES="$HYPRE_SRCDIR/FEI_mv/fei-hypre/*.o"
+   HYPRE_FEI_CDEFS=
    if test "$hypre_using_superlu" = "yes"
    then
       HYPRE_FEI_SUBDIRS="SuperLU $HYPRE_FEI_SUBDIRS"
       HYPRE_FEI_SUPERLU_FILES="$HYPRE_SRCDIR/FEI_mv/SuperLU/SRC/*.o"
+      HYPRE_FEI_CDEFS="$HYPRE_FEI_CDEFS -DHAVE_SUPERLU"
    else
       HYPRE_FEI_SUPERLU_FILES=
    fi
@@ -6742,6 +7287,7 @@ then
    then
       HYPRE_FEI_SUBDIRS="femli $HYPRE_FEI_SUBDIRS"
       HYPRE_FEI_FEMLI_FILES="$HYPRE_SRCDIR/FEI_mv/femli/*.o"
+      HYPRE_FEI_CDEFS="$HYPRE_FEI_CDEFS -DHAVE_MLI"
    else
       HYPRE_FEI_FEMLI_FILES=
    fi
@@ -6803,6 +7349,7 @@ else
    HYPRE_FEI_HYPRE_FILES=
    HYPRE_FEI_FEMLI_FILES=
    HYPRE_FEI_SUPERLU_FILES=
+   HYPRE_FEI_CDEFS=
 fi
 
 if test "$hypre_using_debug" = "yes"
@@ -7232,6 +7779,187 @@ $as_echo "$as_me: WARNING: *****************************************************
    fi
 fi
 
+
+if test "$hypre_using_nvcc" = "yes"
+then
+
+$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h
+
+   NVCCFLAGS="-O3 -arch=sm_60  -ccbin=xlc  -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+   NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+   HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+   CFLAGS="${CFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED "
+   CXXFLAGS="${CXXFLAGS} -DUSE_NVTX -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED "
+else
+   NVCCFLAGS=
+   NVCCLIBS=
+   HYPRE_NVCC_MAKEFILE="Makefile.empty"
+fi
+
+if test "$hypre_user_chose_raja" = "yes"
+then
+   RAJA_LIBS=" $HYPRE_RAJA_LIB_DIR $HYPRE_RAJA_LIB "
+   if test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r"
+   then
+   	CFLAGS+=" -+ "
+   fi
+   if test "$hypre_using_cuda" = "yes"
+      then
+        RAJAFLAGS=" -lRAJA "
+	LDFLAGS=" -ccbin=$CUDACXX -expt-extended-lambda -Xcompiler -fopenmp -arch compute_35 -lcudart -lcuda $RAJAFLAGS "
+   	CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register -Xcompiler $RAJAFLAGS "
+   	CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -lcudart -lcuda -DHYPRE_USE_RAJA -Xcompiler -Wno-deprecated-register  $RAJAFLAGS "
+   	if test "$hypre_using_um" != "yes"
+   	then
+		CFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+	   	CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+       		{ $as_echo "$as_me:${as_lineno-$LINENO}: Configuring with --with-raja and --with-cuda without unified memory." >&5
+$as_echo "$as_me: Configuring with --with-raja and --with-cuda without unified memory." >&6;}
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: It only works for the struct interface." >&5
+$as_echo "$as_me: It only works for the struct interface." >&6;}
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: Try to configure with --with-raja --with-cuda --enable-unified-memory" >&5
+$as_echo "$as_me: Try to configure with --with-raja --with-cuda --enable-unified-memory" >&6;}
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: to use the cuda feature for the whole package" >&5
+$as_echo "$as_me: to use the cuda feature for the whole package" >&6;}
+       		{ $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+	else
+		CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+		CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+   	fi
+	hypre_user_chose_cuda=no
+   else
+   	if test "$hypre_using_openmp" = "yes"
+	then
+	   CC=${CXX}
+	   CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP "
+	   CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA -DHYPRE_USE_OPENMP "
+	   hypre_using_openmp=no
+	else
+	   CC=${CXX}
+   	   CFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA "
+   	   CXXFLAGS+=" -std=c++11 -DHYPRE_USE_RAJA "
+	fi
+   fi
+fi
+
+if test "$hypre_user_chose_kokkos" = "yes"
+then
+   if test "$CXX" = "mpixlC" || test "$CXX" = "xlC_r"
+   then
+   	CFLAGS+=" -+ "
+   fi
+   if test "$hypre_using_cuda" = "yes"
+   then
+      LDFLAGS=" -ccbin=$CUDACXX -arch compute_35 -lcudart -lcuda"
+      HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+      HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+      CC=${CXX}
+      LINK_CC=$LINK_CXX
+      CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS "
+      CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp --x cu -DHYPRE_USE_KOKKOS "
+      if test "$hypre_using_um" != "yes"
+      then
+          CFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+	  CXXFLAGS+=" -DHYPRE_MEMORY_GPU=1 "
+          { $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: Configuring with --with-kokkos and --with-cuda, but not with unified memory" >&5
+$as_echo "$as_me: Configuring with --with-kokkos and --with-cuda, but not with unified memory" >&6;}
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: It only works for the struct interface." >&5
+$as_echo "$as_me: It only works for the struct interface." >&6;}
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: Try to configure with --with-kokkos --with-cuda --enable-unified-memory" >&5
+$as_echo "$as_me: Try to configure with --with-kokkos --with-cuda --enable-unified-memory" >&6;}
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: to use the cuda feature for the whole package" >&5
+$as_echo "$as_me: to use the cuda feature for the whole package" >&6;}
+	  { $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+      else
+	  CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	  CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	  LDFLAGS+=" -L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+
+$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h
+
+	  NVCCFLAGS="-O3 -arch=sm_60 -ccbin=xlc -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+	  NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	  HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+      fi
+      hypre_user_chose_cuda=no
+   else
+      if test "$hypre_using_openmp" = "yes"
+      then
+         CC=${CXX}
+	 CFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP"
+	 CXXFLAGS+=" -fopenmp -std=c++11 -DHYPRE_USE_KOKKOS -DHYPRE_USE_OPENMP"
+	 HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+	 HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+	 hypre_using_openmp=no
+      else
+      	 CC=${CXX}
+	 CFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS "
+	 CXXFLAGS+=" -std=c++11 -DHYPRE_USE_KOKKOS "
+	 HYPRE_KOKKOS_INC_FILE="include $HYPRE_KOKKOS_SRC_DIR/Makefile.kokkos"
+	 HYPRE_KOKKOS_LIB_DIR="-L$HYPRE_KOKKOS_SRC_DIR/lib"
+	 LDFLAGS+=" -L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+
+$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h
+
+	 NVCCFLAGS="-O3 -arch=sm_60 -ccbin=xlc -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+	 NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	 HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+      fi
+  fi
+  KOKKOS_LIBS=" $HYPRE_KOKKOS_LIB_DIR $HYPRE_KOKKOS_LIB "
+fi
+if test "$hypre_user_chose_cuda" = "yes"
+then
+   LDFLAGS+=" -ccbin=$CUDACXX -arch compute_35 "
+   CFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA "
+   CXXFLAGS+=" -ccbin=$CUDACXX -expt-extended-lambda -arch compute_35 --std=c++11 -Xcompiler -fopenmp -Xcompiler -Wno-deprecated-register --x cu -DHYPRE_USE_CUDA "
+   if test "$hypre_using_um" != "yes"
+   then
+	CFLAGS+=" -DHYPRE_MEMORY_GPU"
+	CXXFLAGS+=" -DHYPRE_MEMORY_GPU"
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+       	{ $as_echo "$as_me:${as_lineno-$LINENO}: Configuring with --with-cuda=yes without unified memory." >&5
+$as_echo "$as_me: Configuring with --with-cuda=yes without unified memory." >&6;}
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: It only works for the struct interface." >&5
+$as_echo "$as_me: It only works for the struct interface." >&6;}
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: Use --enable-unified-memory to compile with unified memory." >&5
+$as_echo "$as_me: Use --enable-unified-memory to compile with unified memory." >&6;}
+       	{ $as_echo "$as_me:${as_lineno-$LINENO}: *******************************************************" >&5
+$as_echo "$as_me: *******************************************************" >&6;}
+   else
+	CFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+   	CXXFLAGS+=" -DUSE_NVTX -DHYPRE_USE_GPU "
+	LDFLAGS+=" -L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+
+$as_echo "#define HYPRE_USING_NVCC 1" >>confdefs.h
+
+	NVCCFLAGS="-O3 -arch=sm_60 -ccbin=xlc -I ../hypre/include -I /usr/tcetmp/packages/spectrum_mpi/spectrum_mpi-10.1-xl-gcc-4.9.3/mpi/include/ -DUSE_NVTX -c -DHYPRE_USE_GPU -DHYPRE_USE_MANAGED -I /usr/local/cuda/include"
+	NVCCLIBS="-L /usr/local/cuda/lib64 -lcusparse -lcudart -lcublas -lnvToolsExt"
+	HYPRE_NVCC_MAKEFILE="Makefile.nvcc"
+   fi
+fi
+
+if test "$hypre_using_um" = "yes"
+then
+   LDFLAGS+=" -lcudart -lcuda "
+   CFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include "
+   CXXFLAGS+=" -DHYPRE_USE_MANAGED -I /usr/local/cuda/include "
+fi
+
+if test "$hypre_using_openmp" = "yes"
+then
+
+$as_echo "#define HYPRE_USING_OPENMP 1" >>confdefs.h
+
+fi
+
 HYPRE_INSTALLDIR="${prefix}"
 HYPRE_LIBINSTALL="${libdir}"
 HYPRE_INCINSTALL="${includedir}"
@@ -7410,6 +8138,23 @@ $as_echo "$HYPRE_ARCH" >&6; }
 
 
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 ac_config_files="$ac_config_files config/Makefile.config"
 
 
@@ -7919,7 +8664,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by hypre $as_me 2.11.2, which was
+This file was extended by hypre $as_me 2.12.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -7981,7 +8726,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-hypre config.status 2.11.2
+hypre config.status 2.12.1
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
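The configure block above is the first cut of GPU support in this release: -DHYPRE_USE_MANAGED requests CUDA unified memory, -DHYPRE_MEMORY_GPU is added when unified memory is off, and NVCCFLAGS is still hardwired to sm_60, the xlc host compiler, and one machine's Spectrum MPI include path, so builds on other systems will need those overridden. A rough sketch of what a flag like HYPRE_USE_MANAGED typically gates (illustrative only, not hypre's actual allocator; example_alloc is an assumed name):

    #include <stdlib.h>
    #ifdef HYPRE_USE_MANAGED
    #include <cuda_runtime.h>
    #endif

    /* With -DHYPRE_USE_MANAGED the buffer is CUDA managed (unified)
     * memory, addressable from both host and device; otherwise it is
     * ordinary host memory. */
    static void *example_alloc(size_t nbytes)
    {
    #ifdef HYPRE_USE_MANAGED
       void *ptr = NULL;
       if (cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal) != cudaSuccess)
       {
          return NULL;
       }
       return ptr;
    #else
       return malloc(nbytes);
    #endif
    }
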
diff --git a/src/distributed_ls/Euclid/Euclid_dh.c b/src/distributed_ls/Euclid/Euclid_dh.c
index a16393d..f2f937c 100644
--- a/src/distributed_ls/Euclid/Euclid_dh.c
+++ b/src/distributed_ls/Euclid/Euclid_dh.c
@@ -75,7 +75,7 @@ void Euclid_dhCreate(Euclid_dh *ctxOUT)
   strcpy(ctx->krylovMethod, "bicgstab");
   ctx->maxIts = 200;
   ctx->rtol = 1e-5;
-  ctx->atol = 1e-50;
+  ctx->atol = _ATOL_;
   ctx->its = 0;
   ctx->itsTotal = 0;
   ctx->setupCount = 0;
@@ -132,11 +132,15 @@ void Euclid_dhDestroy(Euclid_dh ctx)
 void Euclid_dhSetup(Euclid_dh ctx)
 {
   START_FUNC_DH
-  HYPRE_Int m, n, beg_row;
+  HYPRE_Int m, n, beg_row, ierr;
   HYPRE_Real t1;
   bool isSetup = ctx->isSetup;
   bool bj = false;
 
+  /* clear error flag if previously setup - DOK */
+  ierr = 0;
+  if (isSetup)
+  {
+     ierr = HYPRE_GetError();
+     HYPRE_ClearAllErrors();
+  }
+
   /*----------------------------------------------------
    * If Euclid was previously setup, print summary of
    * what happened during previous setup/solve
@@ -169,7 +173,9 @@ void Euclid_dhSetup(Euclid_dh ctx)
   if (ctx->A == NULL) {
     SET_V_ERROR("must set ctx->A before calling init");
   }
+
   EuclidGetDimensions(ctx->A, &beg_row, &m, &n); CHECK_V_ERROR;
+  
   ctx->m = m;
   ctx->n = n;
 
@@ -293,6 +299,9 @@ END_OF_FUNCTION: ;
 
   ctx->isSetup = true;
 
+  /* setup done. Restore the error flag saved above - DOK */
+  hypre_error_flag |= ierr;
+
   END_FUNC_DH
 }
 
@@ -410,7 +419,7 @@ void compute_rho_private(Euclid_dh ctx)
       bufGlobal[1] = bufLocal[1];
       bufGlobal[2] = bufLocal[2];
     } else {
-      hypre_MPI_Reduce(bufLocal, bufGlobal, 3, hypre_MPI_DOUBLE, hypre_MPI_SUM, 0, comm_dh);
+      hypre_MPI_Reduce(bufLocal, bufGlobal, 3, hypre_MPI_REAL, hypre_MPI_SUM, 0, comm_dh);
     }
 
     if (myid_dh == 0) {
@@ -888,7 +897,7 @@ void reduce_timings_private(Euclid_dh ctx)
     HYPRE_Real bufOUT[TIMING_BINS];
 
     memcpy(bufOUT, ctx->timing, TIMING_BINS*sizeof(HYPRE_Real));
-    hypre_MPI_Reduce(bufOUT, ctx->timing, TIMING_BINS, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, comm_dh);
+    hypre_MPI_Reduce(bufOUT, ctx->timing, TIMING_BINS, hypre_MPI_REAL, hypre_MPI_MAX, 0, comm_dh);
   }
 
   ctx->timingsWereReduced = true;
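
The recurring change in this and the following distributed_ls files is hypre_MPI_DOUBLE becoming hypre_MPI_REAL, so that the MPI datatype used in messages and reductions tracks HYPRE_Real now that the library can be built in single, double, or long double precision. A minimal sketch of such a mapping (the HYPRE_SINGLE/HYPRE_LONG_DOUBLE guards follow the convention visible later in this patch; hypre's actual definition may differ):

    #include <mpi.h>

    /* Sketch: pick the MPI datatype that matches the configured HYPRE_Real. */
    #if defined(HYPRE_SINGLE)
    typedef float HYPRE_Real;
    #define hypre_MPI_REAL MPI_FLOAT
    #elif defined(HYPRE_LONG_DOUBLE)
    typedef long double HYPRE_Real;
    #define hypre_MPI_REAL MPI_LONG_DOUBLE
    #else /* default */
    typedef double HYPRE_Real;
    #define hypre_MPI_REAL MPI_DOUBLE
    #endif

With this mapping, every hypre_MPI_Reduce and hypre_MPI_Allreduce touched below stays type-correct whichever precision is configured.
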
diff --git a/src/distributed_ls/Euclid/ExternalRows_dh.c b/src/distributed_ls/Euclid/ExternalRows_dh.c
index 860b7f8..98edc81 100644
--- a/src/distributed_ls/Euclid/ExternalRows_dh.c
+++ b/src/distributed_ls/Euclid/ExternalRows_dh.c
@@ -317,7 +317,7 @@ void rcv_external_rows_private(ExternalRows_dh er)
     nz = rcv_nz_counts[i];
     hypre_MPI_Irecv(extRowCval+offset, nz, HYPRE_MPI_INT,    nabor, CVAL_TAG, comm_dh, er->req1+i);
     hypre_MPI_Irecv(extRowFill+offset, nz, HYPRE_MPI_INT,    nabor, FILL_TAG, comm_dh, er->req2+i);
-    hypre_MPI_Irecv(extRowAval+offset, nz, hypre_MPI_DOUBLE, nabor, AVAL_TAG, comm_dh, er->req3+i);
+    hypre_MPI_Irecv(extRowAval+offset, nz, hypre_MPI_REAL, nabor, AVAL_TAG, comm_dh, er->req3+i);
     offset += nz;
   }
 
@@ -537,7 +537,7 @@ void send_external_rows_private(ExternalRows_dh er)
     HYPRE_Int nabor = hiNabors[i];
     hypre_MPI_Isend(cvalSend, nz, HYPRE_MPI_INT,    nabor, CVAL_TAG, comm_dh, er->cval_req+i);
     hypre_MPI_Isend(fillSend, nz, HYPRE_MPI_INT,    nabor, FILL_TAG, comm_dh, er->fill_req+i); 
-    hypre_MPI_Isend(avalSend, nz, hypre_MPI_DOUBLE, nabor, AVAL_TAG, comm_dh, er->aval_req+i);
+    hypre_MPI_Isend(avalSend, nz, hypre_MPI_REAL, nabor, AVAL_TAG, comm_dh, er->aval_req+i);
   }
   END_FUNC_DH
 }
diff --git a/src/distributed_ls/Euclid/Factor_dh.c b/src/distributed_ls/Euclid/Factor_dh.c
index 2f024a4..c3e18c6 100644
--- a/src/distributed_ls/Euclid/Factor_dh.c
+++ b/src/distributed_ls/Euclid/Factor_dh.c
@@ -379,7 +379,7 @@ static HYPRE_Int setup_receives_private(Factor_dh mat, HYPRE_Int *beg_rows, HYPR
     hypre_MPI_Request_free(&request); 
 
     /* set up persistent comms for receiving the values from this_pe */
-    hypre_MPI_Recv_init(recvBuf+i, j-i, hypre_MPI_DOUBLE, this_pe, 555,
+    hypre_MPI_Recv_init(recvBuf+i, j-i, hypre_MPI_REAL, this_pe, 555,
                         comm_dh, req+num_recv); 
     ++num_recv;
   }
@@ -457,7 +457,7 @@ static void setup_sends_private(Factor_dh mat, HYPRE_Int *inlist,
       ++count;
 
       /* Set up the send */
-      hypre_MPI_Send_init(sendBuf, inlist[i], hypre_MPI_DOUBLE, i, 555, comm_dh, sendReq); 
+      hypre_MPI_Send_init(sendBuf, inlist[i], hypre_MPI_REAL, i, 555, comm_dh, sendReq); 
     }
   }
 
@@ -1128,7 +1128,7 @@ HYPRE_Real Factor_dhMaxPivotInverse(Factor_dh mat)
   if (np_dh == 1) {
     minGlobal = min;
   } else {
-    hypre_MPI_Reduce(&min, &minGlobal, 1, hypre_MPI_DOUBLE, hypre_MPI_MIN, 0, comm_dh);
+    hypre_MPI_Reduce(&min, &minGlobal, 1, hypre_MPI_REAL, hypre_MPI_MIN, 0, comm_dh);
   }
 
   if (minGlobal == 0) {
@@ -1155,7 +1155,7 @@ HYPRE_Real Factor_dhMaxValue(Factor_dh mat)
   if (np_dh == 1) {
     maxGlobal = max;
   } else {
-    hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, comm_dh);
+    hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, comm_dh);
   }
   END_FUNC_VAL(maxGlobal)
 }
@@ -1185,7 +1185,7 @@ HYPRE_Real Factor_dhCondEst(Factor_dh mat, Euclid_dh ctx)
   if (np_dh == 1) {
     maxGlobal = max;
   } else {
-    hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, comm_dh);
+    hypre_MPI_Reduce(&max, &maxGlobal, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, comm_dh);
   }
   END_FUNC_VAL(maxGlobal)
 }
diff --git a/src/distributed_ls/Euclid/Mat_dh.c b/src/distributed_ls/Euclid/Mat_dh.c
index ad8248c..e3b7ff2 100644
--- a/src/distributed_ls/Euclid/Mat_dh.c
+++ b/src/distributed_ls/Euclid/Mat_dh.c
@@ -251,7 +251,7 @@ void setup_matvec_receives_private(Mat_dh mat, HYPRE_Int *beg_rows, HYPRE_Int *e
    /* Count of the number of indices needed from this_pe */
     outlist[this_pe] = j-i;
 
-    ierr = hypre_MPI_Recv_init(&mat->recvbuf[i+m], j-i, hypre_MPI_DOUBLE, this_pe, 555,
+    ierr = hypre_MPI_Recv_init(&mat->recvbuf[i+m], j-i, hypre_MPI_REAL, this_pe, 555,
             comm_dh, &mat->recv_req[mat->num_recv]); CHECK_MPI_V_ERROR(ierr);
 
     mat->num_recv++;
@@ -289,7 +289,7 @@ void setup_matvec_sends_private(Mat_dh mat, HYPRE_Int *inlist)
       ierr = hypre_MPI_Irecv(&mat->sendind[j], inlist[i], HYPRE_MPI_INT, i, 444, comm_dh,
                             &requests[mat->num_send]); CHECK_MPI_V_ERROR(ierr);
       /* Set up the send */
-      ierr = hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_DOUBLE, i, 555, comm_dh,
+      ierr = hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_REAL, i, 555, comm_dh,
                        &mat->send_req[mat->num_send]); CHECK_MPI_V_ERROR(ierr);
 
       mat->num_send++;
@@ -601,8 +601,8 @@ void Mat_dhReduceTiming(Mat_dh mat)
   if (mat->time[MATVEC_MPI_TIME]) {
     mat->time[MATVEC_RATIO] = mat->time[MATVEC_TIME] / mat->time[MATVEC_MPI_TIME];
   }
-  hypre_MPI_Allreduce(mat->time, mat->time_min, MAT_DH_BINS, hypre_MPI_DOUBLE, hypre_MPI_MIN, comm_dh);
-  hypre_MPI_Allreduce(mat->time, mat->time_max, MAT_DH_BINS, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm_dh);
+  hypre_MPI_Allreduce(mat->time, mat->time_min, MAT_DH_BINS, hypre_MPI_REAL, hypre_MPI_MIN, comm_dh);
+  hypre_MPI_Allreduce(mat->time, mat->time_max, MAT_DH_BINS, hypre_MPI_REAL, hypre_MPI_MAX, comm_dh);
   END_FUNC_DH
 }
 
diff --git a/src/distributed_ls/Euclid/TimeLog_dh.c b/src/distributed_ls/Euclid/TimeLog_dh.c
index ab041d5..36be5b4 100644
--- a/src/distributed_ls/Euclid/TimeLog_dh.c
+++ b/src/distributed_ls/Euclid/TimeLog_dh.c
@@ -121,8 +121,8 @@ void TimeLog_dhPrint(TimeLog_dh t, FILE *fp, bool allPrint)
     hypre_sprintf(t->desc[t->last], "========== totals, and reset ==========\n");
     t->last += 1;
 
-    hypre_MPI_Allreduce(t->time, timeMax, t->last, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm_dh);
-    hypre_MPI_Allreduce(t->time, timeMin, t->last, hypre_MPI_DOUBLE, hypre_MPI_MIN, comm_dh);
+    hypre_MPI_Allreduce(t->time, timeMax, t->last, hypre_MPI_REAL, hypre_MPI_MAX, comm_dh);
+    hypre_MPI_Allreduce(t->time, timeMin, t->last, hypre_MPI_REAL, hypre_MPI_MIN, comm_dh);
     wasSummed = true;
   }
 
diff --git a/src/distributed_ls/Euclid/_hypre_Euclid.h b/src/distributed_ls/Euclid/_hypre_Euclid.h
index e8692ac..340eed7 100644
--- a/src/distributed_ls/Euclid/_hypre_Euclid.h
+++ b/src/distributed_ls/Euclid/_hypre_Euclid.h
@@ -198,8 +198,14 @@ you need to write EUCLID_GET_ROW() functions: see src/getRow.c
 #define FABS(a)    ((a) < 0 ? -(a) : a)
 #endif
 
-/* used in Mat_SEQ_PrintTriples, so matlab won't discard zeros (yuck!) */
+#ifdef HYPRE_SINGLE
+#define _ATOL_ 1.0e-16   /* used to compute absolute tolerance for Euclid's internal Krylov solvers */
+#define _MATLAB_ZERO_  1e-30 /* used in Mat_SEQ_PrintTriples, so matlab won't discard zeros (yuck!) */
+#else /* default */
+#define _ATOL_ 1.0e-50
 #define _MATLAB_ZERO_  1e-100
+#endif
+
 
 
 /*---------------------------------------------------------------------- 
@@ -481,7 +487,11 @@ extern void  printErrorMsg(FILE *fp);
 #endif
 
 #define MSG_BUF_SIZE_DH MAX(1024, hypre_MPI_MAX_ERROR_STRING)
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA)
+static char  msgBuf_dh[MSG_BUF_SIZE_DH];
+#else
 extern char  msgBuf_dh[MSG_BUF_SIZE_DH];
+#endif
 
 /* Each processor (may) open a logfile.
  * The bools are switches for controlling the amount of informational 
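
Note the linkage trade-off in the msgBuf_dh change above: when any of the GPU macros is defined, the header declares the buffer static, so each translation unit that includes it gets its own private copy instead of sharing the single array defined in globalObjects.c. A sketch of the two choices (names illustrative):

    /* buf.h - sketch of the two linkage choices for a shared message buffer */
    #ifdef PER_FILE_COPY
    static char msgBuf[64];   /* every .c file that includes this header
                                 gets its own independent array          */
    #else
    extern char msgBuf[64];   /* all files share one array, which must be
                                 defined in exactly one .c file          */
    #endif

The static variant avoids the external definition, at the cost that a message formatted in one file is invisible to code in another; that is presumably safe here because the buffer is always formatted immediately before it is used.
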
diff --git a/src/distributed_ls/Euclid/blas_dh.c b/src/distributed_ls/Euclid/blas_dh.c
index 0724144..2d9516b 100644
--- a/src/distributed_ls/Euclid/blas_dh.c
+++ b/src/distributed_ls/Euclid/blas_dh.c
@@ -119,7 +119,7 @@ HYPRE_Real InnerProd(HYPRE_Int n, HYPRE_Real *x, HYPRE_Real *y)
     }
 
     if (np_dh > 1) {
-      hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm_dh);
+      hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm_dh);
     } else {
       result = local_result;
     }
@@ -145,7 +145,7 @@ HYPRE_Real Norm2(HYPRE_Int n, HYPRE_Real *x)
   }
 
   if (np_dh > 1) {
-    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm_dh);
+    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm_dh);
   } else {
     result = local_result;
   }
diff --git a/src/distributed_ls/Euclid/globalObjects.c b/src/distributed_ls/Euclid/globalObjects.c
index 86be1af..5f45463 100644
--- a/src/distributed_ls/Euclid/globalObjects.c
+++ b/src/distributed_ls/Euclid/globalObjects.c
@@ -30,7 +30,10 @@ Parser_dh   parser_dh = NULL;   /* for setting/getting runtime options */
 TimeLog_dh  tlog_dh = NULL;     /* internal timing  functionality */
 Mem_dh      mem_dh = NULL;      /* memory management */
 FILE        *logFile = NULL;
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA)
+#else
 char        msgBuf_dh[MSG_BUF_SIZE_DH]; /* for internal use */
+#endif
 HYPRE_Int         np_dh = 1;     /* number of processors and subdomains */
 HYPRE_Int         myid_dh = 0;   /* rank of this processor (and subdomain) */
 MPI_Comm    comm_dh = 0;
diff --git a/src/distributed_ls/Euclid/macros_dh.h b/src/distributed_ls/Euclid/macros_dh.h
index 76de87a..66a2fc8 100644
--- a/src/distributed_ls/Euclid/macros_dh.h
+++ b/src/distributed_ls/Euclid/macros_dh.h
@@ -34,7 +34,11 @@
 #endif
 
 /* used in Mat_SEQ_PrintTriples, so matlab won't discard zeros (yuck!) */
+#ifdef HYPRE_SINGLE
+#define _MATLAB_ZERO_  1e-30
+#else /* default */
 #define _MATLAB_ZERO_  1e-100
+#endif
 
 
 /*---------------------------------------------------------------------- 
diff --git a/src/distributed_ls/Euclid/mat_dh_private.c b/src/distributed_ls/Euclid/mat_dh_private.c
index 54a6e21..12927c0 100644
--- a/src/distributed_ls/Euclid/mat_dh_private.c
+++ b/src/distributed_ls/Euclid/mat_dh_private.c
@@ -1082,7 +1082,7 @@ void partition_and_distribute_metis_private(Mat_dh A, Mat_dh *Bout)
       }
 
       hypre_MPI_Isend(cval+rp[i], count, HYPRE_MPI_INT, owner, CVAL_TAG, comm_dh, send_req+2*i);
-      hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_DOUBLE, owner, AVAL_TAG, comm_dh, send_req+2*i+1);
+      hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_REAL, owner, AVAL_TAG, comm_dh, send_req+2*i+1);
     }
   } 
 
@@ -1105,7 +1105,7 @@ void partition_and_distribute_metis_private(Mat_dh A, Mat_dh *Bout)
       }
 
       hypre_MPI_Irecv(cval+rp[i], count, HYPRE_MPI_INT, 0, CVAL_TAG, comm_dh, rcv_req+2*i);
-      hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_DOUBLE, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1);
+      hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_REAL, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1);
     }
   }
 
@@ -1195,7 +1195,7 @@ void partition_and_distribute_private(Mat_dh A, Mat_dh *Bout)
       }
 
       hypre_MPI_Isend(cval+rp[i], count, HYPRE_MPI_INT, owner, CVAL_TAG, comm_dh, send_req+2*i);
-      hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_DOUBLE, owner, AVAL_TAG, comm_dh, send_req+2*i+1);
+      hypre_MPI_Isend(aval+rp[i], count, hypre_MPI_REAL, owner, AVAL_TAG, comm_dh, send_req+2*i+1);
     }
   } 
 
@@ -1218,7 +1218,7 @@ void partition_and_distribute_private(Mat_dh A, Mat_dh *Bout)
       }
 
       hypre_MPI_Irecv(cval+rp[i], count, HYPRE_MPI_INT, 0, CVAL_TAG, comm_dh, rcv_req+2*i);
-      hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_DOUBLE, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1);
+      hypre_MPI_Irecv(aval+rp[i], count, hypre_MPI_REAL, 0, AVAL_TAG, comm_dh, rcv_req+2*i+1);
     }
   }
 
diff --git a/src/distributed_ls/ParaSails/ConjGrad.c b/src/distributed_ls/ParaSails/ConjGrad.c
index b38ff84..f1485fa 100644
--- a/src/distributed_ls/ParaSails/ConjGrad.c
+++ b/src/distributed_ls/ParaSails/ConjGrad.c
@@ -37,7 +37,7 @@ static HYPRE_Real InnerProd(HYPRE_Int n, HYPRE_Real *x, HYPRE_Real *y, MPI_Comm
     HYPRE_Int one = 1;
     local_result = hypre_F90_NAME_BLAS(ddot, DDOT)(&n, x, &one, y, &one);
 
-    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm);
+    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm);
 
     return result;
 }
diff --git a/src/distributed_ls/ParaSails/DiagScale.c b/src/distributed_ls/ParaSails/DiagScale.c
index d3dc4e8..e46710a 100644
--- a/src/distributed_ls/ParaSails/DiagScale.c
+++ b/src/distributed_ls/ParaSails/DiagScale.c
@@ -76,7 +76,7 @@ static void ExchangeDiagEntries(MPI_Comm comm, Matrix *mat, HYPRE_Int reqlen,
         }
 
         /* Post receive for diagonal values */
-        hypre_MPI_Irecv(&diags[i], j-i, hypre_MPI_DOUBLE, this_pe, DIAG_VALS_TAG, 
+        hypre_MPI_Irecv(&diags[i], j-i, hypre_MPI_REAL, this_pe, DIAG_VALS_TAG, 
 	    comm, &requests[*num_requests]);
 
         /* Request rows in reqind[i..j-1] */
@@ -130,7 +130,7 @@ static void ExchangeDiagEntriesServer(MPI_Comm comm, Matrix *mat,
 	    sendbuf[j] = local_diags[recvbuf[j] - mat->beg_row];
 
 	/* Use ready-mode send, since receives already posted */
-	hypre_MPI_Irsend(sendbuf, count, hypre_MPI_DOUBLE, source, 
+	hypre_MPI_Irsend(sendbuf, count, hypre_MPI_REAL, source, 
 	    DIAG_VALS_TAG, comm, &requests[i]);
     }
 }
diff --git a/src/distributed_ls/ParaSails/FGmres.c b/src/distributed_ls/ParaSails/FGmres.c
index 63f4c8b..4a41c3b 100644
--- a/src/distributed_ls/ParaSails/FGmres.c
+++ b/src/distributed_ls/ParaSails/FGmres.c
@@ -37,7 +37,7 @@ static HYPRE_Real InnerProd(HYPRE_Int n, HYPRE_Real *x, HYPRE_Real *y, MPI_Comm
     HYPRE_Int one = 1;
     local_result = hypre_F90_NAME_BLAS(ddot, DDOT)(&n, x, &one, y, &one);
 
-    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm);
+    hypre_MPI_Allreduce(&local_result, &result, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm);
 
     return result;
 }
diff --git a/src/distributed_ls/ParaSails/LoadBal.c b/src/distributed_ls/ParaSails/LoadBal.c
index cf023c9..005da13 100644
--- a/src/distributed_ls/ParaSails/LoadBal.c
+++ b/src/distributed_ls/ParaSails/LoadBal.c
@@ -57,7 +57,7 @@ void LoadBalInit(MPI_Comm comm, HYPRE_Real local_cost, HYPRE_Real beta,
 
     cost = (HYPRE_Real *) malloc(npes * sizeof(HYPRE_Real));
 
-    hypre_MPI_Allgather(&local_cost, 1, hypre_MPI_DOUBLE, cost, 1, hypre_MPI_DOUBLE, comm);
+    hypre_MPI_Allgather(&local_cost, 1, hypre_MPI_REAL, cost, 1, hypre_MPI_REAL, comm);
 
     /* Compute the average cost */
     average = 0.0;
@@ -284,7 +284,7 @@ void LoadBalRecipSend(MPI_Comm comm, HYPRE_Int num_taken,
             bufferp += len;
         }
 
-        hypre_MPI_Isend(recip_data[i].buffer, buflen, hypre_MPI_DOUBLE, recip_data[i].pe,
+        hypre_MPI_Isend(recip_data[i].buffer, buflen, hypre_MPI_REAL, recip_data[i].pe,
             LOADBAL_REP_TAG, comm, &request[i]);
 
         MatrixDestroy(mat);
@@ -311,10 +311,10 @@ void LoadBalDonorRecv(MPI_Comm comm, Matrix *mat,
     {
         hypre_MPI_Probe(hypre_MPI_ANY_SOURCE, LOADBAL_REP_TAG, comm, &status);
         source = status.hypre_MPI_SOURCE;
-        hypre_MPI_Get_count(&status, hypre_MPI_DOUBLE, &count);
+        hypre_MPI_Get_count(&status, hypre_MPI_REAL, &count);
 
         buffer = (HYPRE_Real *) malloc(count * sizeof(HYPRE_Real));
-        hypre_MPI_Recv(buffer, count, hypre_MPI_DOUBLE, source, LOADBAL_REP_TAG, 
+        hypre_MPI_Recv(buffer, count, hypre_MPI_REAL, source, LOADBAL_REP_TAG, 
            comm, &status);
 
 	/* search for which entry in donor_data this message corresponds to */
diff --git a/src/distributed_ls/ParaSails/Matrix.c b/src/distributed_ls/ParaSails/Matrix.c
index fc996c0..f943f26 100644
--- a/src/distributed_ls/ParaSails/Matrix.c
+++ b/src/distributed_ls/ParaSails/Matrix.c
@@ -532,7 +532,7 @@ void RhsRead(HYPRE_Real *rhs, Matrix *mat, char *filename)
 
     if (mype != 0)
     {
-	hypre_MPI_Recv(rhs, num_local, hypre_MPI_DOUBLE, 0, 0, mat->comm, &status);
+	hypre_MPI_Recv(rhs, num_local, hypre_MPI_REAL, 0, 0, mat->comm, &status);
 	return;
     }
 
@@ -567,7 +567,7 @@ void RhsRead(HYPRE_Real *rhs, Matrix *mat, char *filename)
 	  else
             hypre_fscanf(file, "%lf", &buffer[i]);
 
-	hypre_MPI_Send(buffer, num_local, hypre_MPI_DOUBLE, pe, 0, mat->comm);
+	hypre_MPI_Send(buffer, num_local, hypre_MPI_REAL, pe, 0, mat->comm);
     }
 
     free(buffer);
@@ -614,10 +614,10 @@ static void SetupReceives(Matrix *mat, HYPRE_Int reqlen, HYPRE_Int *reqind, HYPR
 	/* Count of the number of indices needed from this_pe */
         outlist[this_pe] = j-i;
 
-        hypre_MPI_Recv_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_DOUBLE, this_pe, 555,
+        hypre_MPI_Recv_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_REAL, this_pe, 555,
 	    comm, &mat->recv_req[mat->num_recv]);
 
-        hypre_MPI_Send_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_DOUBLE, this_pe, 666,
+        hypre_MPI_Send_init(&mat->recvbuf[i+num_local], j-i, hypre_MPI_REAL, this_pe, 666,
 	    comm, &mat->send_req2[mat->num_recv]);
 
         mat->num_recv++;
@@ -665,11 +665,11 @@ static void SetupSends(Matrix *mat, HYPRE_Int *inlist)
                 &requests[mat->num_send]);
 
 	    /* Set up the send */
-	    hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_DOUBLE, i, 555, comm,
+	    hypre_MPI_Send_init(&mat->sendbuf[j], inlist[i], hypre_MPI_REAL, i, 555, comm,
 		&mat->send_req[mat->num_send]);
 
 	    /* Set up the receive for the transpose  */
-	    hypre_MPI_Recv_init(&mat->sendbuf[j], inlist[i], hypre_MPI_DOUBLE, i, 666, comm,
+	    hypre_MPI_Recv_init(&mat->sendbuf[j], inlist[i], hypre_MPI_REAL, i, 666, comm,
 		&mat->recv_req2[mat->num_send]);
 
 	    mat->num_send++;
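
SetupReceives and SetupSends above build persistent requests once (hypre_MPI_Recv_init / hypre_MPI_Send_init) and restart them on every matvec, avoiding the cost of re-posting identical message envelopes. A standalone sketch of the pattern in plain MPI (partner rank, tag, and counts are illustrative):

    #include <mpi.h>

    /* Exchange 'count' doubles with a fixed partner many times using
     * persistent requests: set up once, then Start/Wait per iteration. */
    void exchange_loop(double *sendbuf, double *recvbuf, int count,
                       int partner, MPI_Comm comm, int iters)
    {
       MPI_Request reqs[2];
       MPI_Send_init(sendbuf, count, MPI_DOUBLE, partner, 555, comm, &reqs[0]);
       MPI_Recv_init(recvbuf, count, MPI_DOUBLE, partner, 555, comm, &reqs[1]);

       for (int i = 0; i < iters; i++)
       {
          MPI_Startall(2, reqs);
          MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
       }

       MPI_Request_free(&reqs[0]);
       MPI_Request_free(&reqs[1]);
    }
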
diff --git a/src/distributed_ls/ParaSails/ParaSails.c b/src/distributed_ls/ParaSails/ParaSails.c
index 6aed20d..b96a80b 100644
--- a/src/distributed_ls/ParaSails/ParaSails.c
+++ b/src/distributed_ls/ParaSails/ParaSails.c
@@ -371,7 +371,7 @@ static void SendReplyStoredRows(MPI_Comm comm, Numbering *numb,
 
     hypre_MPI_Request_free(request);
 
-    hypre_MPI_Isend(valbuf, valbufp-valbuf, hypre_MPI_DOUBLE, dest, ROW_REPV_TAG,
+    hypre_MPI_Isend(valbuf, valbufp-valbuf, hypre_MPI_REAL, dest, ROW_REPV_TAG,
         comm, request);
 }
 
@@ -400,7 +400,7 @@ static void ReceiveReplyStoredRows(MPI_Comm comm, Numbering *numb,
     ind = StoredRowsAllocInd(stored_rows, count);
     hypre_MPI_Recv(ind, count, HYPRE_MPI_INT, source, ROW_REPI_TAG, comm, &status);
     val = StoredRowsAllocVal(stored_rows, count);
-    hypre_MPI_Recv(val, count, hypre_MPI_DOUBLE, source, ROW_REPV_TAG, comm, &status);
+    hypre_MPI_Recv(val, count, hypre_MPI_REAL, source, ROW_REPV_TAG, comm, &status);
 
     /* Parse the message */
     num_rows = *ind++; /* number of rows */
@@ -1430,7 +1430,7 @@ static HYPRE_Real SelectThresh(MPI_Comm comm, Matrix *A, DiagScale *diag_scale,
     }
 
     /* Find the average across all processors */
-    hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm);
+    hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm);
     hypre_MPI_Comm_size(comm, &npes);
 
     free(buffer);
@@ -1486,7 +1486,7 @@ static HYPRE_Real SelectFilter(MPI_Comm comm, Matrix *M, DiagScale *diag_scale,
     }
 
     /* Find the average across all processors */
-    hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm);
+    hypre_MPI_Allreduce(&localsum, &sum, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm);
     hypre_MPI_Comm_size(comm, &npes);
 
     free(buffer);
@@ -1980,9 +1980,9 @@ HYPRE_Real ParaSailsStatsPattern(ParaSails *ps, Matrix *A)
     }
 
     hypre_MPI_Allreduce(&ps->setup_pattern_time, &max_pattern_time, 
-	1, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm);
-    hypre_MPI_Allreduce(&ps->cost, &max_cost, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm);
-    hypre_MPI_Allreduce(&ps->cost, &ave_cost, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, comm);
+	1, hypre_MPI_REAL, hypre_MPI_MAX, comm);
+    hypre_MPI_Allreduce(&ps->cost, &max_cost, 1, hypre_MPI_REAL, hypre_MPI_MAX, comm);
+    hypre_MPI_Allreduce(&ps->cost, &ave_cost, 1, hypre_MPI_REAL, hypre_MPI_SUM, comm);
     ave_cost = ave_cost / (HYPRE_Real) npes;
 
     if (mype)
@@ -2029,13 +2029,13 @@ void ParaSailsStatsValues(ParaSails *ps, Matrix *A)
     }
 
     hypre_MPI_Allreduce(&ps->setup_values_time, &max_values_time, 
-	1, hypre_MPI_DOUBLE, hypre_MPI_MAX, comm);
+	1, hypre_MPI_REAL, hypre_MPI_MAX, comm);
 
     if (!mype)
         setup_times = (HYPRE_Real *) malloc(npes * sizeof(HYPRE_Real));
 
     temp = ps->setup_pattern_time + ps->setup_values_time;
-    hypre_MPI_Gather(&temp, 1, hypre_MPI_DOUBLE, setup_times, 1, hypre_MPI_DOUBLE, 0, comm);
+    hypre_MPI_Gather(&temp, 1, hypre_MPI_REAL, setup_times, 1, hypre_MPI_REAL, 0, comm);
 
     if (mype)
         return;
diff --git a/src/distributed_ls/ParaSails/driver.c b/src/distributed_ls/ParaSails/driver.c
index 9878f6f..6cd3feb 100644
--- a/src/distributed_ls/ParaSails/driver.c
+++ b/src/distributed_ls/ParaSails/driver.c
@@ -139,11 +139,11 @@ HYPRE_Int main(HYPRE_Int argc, char *argv[])
 #endif
 	    }
 
-	    hypre_MPI_Bcast(&threshg, 1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD);
-	    hypre_MPI_Bcast(&thresh,  1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD);
+	    hypre_MPI_Bcast(&threshg, 1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD);
+	    hypre_MPI_Bcast(&thresh,  1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD);
 	    hypre_MPI_Bcast(&nlevels, 1, HYPRE_MPI_INT,    0, hypre_MPI_COMM_WORLD);
-	    hypre_MPI_Bcast(&filter,  1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD);
-	    hypre_MPI_Bcast(&loadbal, 1, hypre_MPI_DOUBLE, 0, hypre_MPI_COMM_WORLD);
+	    hypre_MPI_Bcast(&filter,  1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD);
+	    hypre_MPI_Bcast(&loadbal, 1, hypre_MPI_REAL, 0, hypre_MPI_COMM_WORLD);
 
             if (nlevels < 0)
                 break;
@@ -220,9 +220,9 @@ HYPRE_Int main(HYPRE_Int argc, char *argv[])
         time1 = hypre_MPI_Wtime();
 	solve_time = time1-time0;
 
-        hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, 
+        hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, 
 	    hypre_MPI_COMM_WORLD);
-        hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0, 
+        hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0, 
 	    hypre_MPI_COMM_WORLD);
 
 	if (mype == 0)
diff --git a/src/distributed_ls/ParaSails/lapdriver.c b/src/distributed_ls/ParaSails/lapdriver.c
index 08b5e59..d3549f1 100644
--- a/src/distributed_ls/ParaSails/lapdriver.c
+++ b/src/distributed_ls/ParaSails/lapdriver.c
@@ -304,9 +304,9 @@ hcstop();
         ParaSailsStatsPattern(ps, A);
         ParaSailsStatsValues(ps, A);
 
-        hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0,
+        hypre_MPI_Reduce(&setup_time, &max_setup_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0,
             hypre_MPI_COMM_WORLD);
-        hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, 0,
+        hypre_MPI_Reduce(&solve_time, &max_solve_time, 1, hypre_MPI_REAL, hypre_MPI_MAX, 0,
             hypre_MPI_COMM_WORLD);
 
         if (mype == 0)
diff --git a/src/distributed_ls/pilut/comm.c b/src/distributed_ls/pilut/comm.c
index 6329d11..0cb104d 100644
--- a/src/distributed_ls/pilut/comm.c
+++ b/src/distributed_ls/pilut/comm.c
@@ -77,7 +77,7 @@ HYPRE_Int hypre_GlobalSESum(HYPRE_Int value, MPI_Comm hypre_MPI_Context)
 HYPRE_Real hypre_GlobalSEMaxDouble(HYPRE_Real value, MPI_Comm hypre_MPI_Context)
 {
   HYPRE_Real max;
-  hypre_MPI_Allreduce( &value, &max, 1, hypre_MPI_DOUBLE, hypre_MPI_MAX, hypre_MPI_Context );
+  hypre_MPI_Allreduce( &value, &max, 1, hypre_MPI_REAL, hypre_MPI_MAX, hypre_MPI_Context );
 
   return max;
 }
@@ -88,7 +88,7 @@ HYPRE_Real hypre_GlobalSEMaxDouble(HYPRE_Real value, MPI_Comm hypre_MPI_Context)
 HYPRE_Real hypre_GlobalSEMinDouble(HYPRE_Real value, MPI_Comm hypre_MPI_Context)
 {
   HYPRE_Real min;
-  hypre_MPI_Allreduce( &value, &min, 1, hypre_MPI_DOUBLE, hypre_MPI_MIN, hypre_MPI_Context );
+  hypre_MPI_Allreduce( &value, &min, 1, hypre_MPI_REAL, hypre_MPI_MIN, hypre_MPI_Context );
 
   return min;
 }
@@ -99,7 +99,7 @@ HYPRE_Real hypre_GlobalSEMinDouble(HYPRE_Real value, MPI_Comm hypre_MPI_Context)
 HYPRE_Real hypre_GlobalSESumDouble(HYPRE_Real value, MPI_Comm hypre_MPI_Context)
 {
   HYPRE_Real sum;
-  hypre_MPI_Allreduce( &value, &sum, 1, hypre_MPI_DOUBLE, hypre_MPI_SUM, hypre_MPI_Context );
+  hypre_MPI_Allreduce( &value, &sum, 1, hypre_MPI_REAL, hypre_MPI_SUM, hypre_MPI_Context );
 
   return sum;
 }
diff --git a/src/distributed_ls/pilut/parilut.c b/src/distributed_ls/pilut/parilut.c
index a903685..2734cd2 100644
--- a/src/distributed_ls/pilut/parilut.c
+++ b/src/distributed_ls/pilut/parilut.c
@@ -503,7 +503,7 @@ void hypre_SendFactoredRows(FactorMatType *ldu, CommInfoType *cinfo,
     hypre_MPI_Irecv( incolind+j, cnt, HYPRE_MPI_INT,
 	      penum, TAG_Send_colind, pilut_comm, &index_requests[i] );
 
-    hypre_MPI_Irecv( invalues+j, cnt, hypre_MPI_DOUBLE,
+    hypre_MPI_Irecv( invalues+j, cnt, hypre_MPI_REAL,
 	      penum, TAG_Send_values, pilut_comm, &value_requests[i] );
 
     j += cnt;
@@ -550,7 +550,7 @@ void hypre_SendFactoredRows(FactorMatType *ldu, CommInfoType *cinfo,
 
   /* send values to each neighbor */
   for (i=0; i<snnbr; i++) {
-    hypre_MPI_Send( dgatherbuf, l, hypre_MPI_DOUBLE,
+    hypre_MPI_Send( dgatherbuf, l, hypre_MPI_REAL,
 	      snbrind[i], TAG_Send_values, pilut_comm );
   }
 
diff --git a/src/distributed_ls/pilut/trifactor.c b/src/distributed_ls/pilut/trifactor.c
index fe70fd4..dde3881 100644
--- a/src/distributed_ls/pilut/trifactor.c
+++ b/src/distributed_ls/pilut/trifactor.c
@@ -126,7 +126,7 @@ void hypre_LDUSolve(DataDistType *ddist, FactorMatType *ldu, HYPRE_Real *x, HYPR
     /* Recv the required lx elements from the appropriate processors */
     for (i=0; i<rnbrpes; i++) {
       if ( rnum[i] > 0 ) { /* Something to recv */
-	hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_DOUBLE,
+	hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_REAL,
 		  rpes[i], TAG, pilut_comm, &receive_requests[i] );
 
 	rdone[i] += rnum[i] ;
@@ -139,7 +139,7 @@ void hypre_LDUSolve(DataDistType *ddist, FactorMatType *ldu, HYPRE_Real *x, HYPR
         for (j=auxsptr[i], l=0;   j<sptr[i+1] && sindex[j]<nnodes[ii];   j++, l++) 
           gatherbuf[l] = lx[sindex[j]];
 
-	hypre_MPI_Send( gatherbuf, l, hypre_MPI_DOUBLE,
+	hypre_MPI_Send( gatherbuf, l, hypre_MPI_REAL,
 		  spes[i], TAG, pilut_comm );
 
         auxsptr[i] = j;
@@ -212,7 +212,7 @@ void hypre_LDUSolve(DataDistType *ddist, FactorMatType *ldu, HYPRE_Real *x, HYPR
     /* Recv the required ux elements from the appropriate processors */
     for (i=0; i<rnbrpes; i++) {
       if ( rnum[i] > 0 ) { /* Something to recv */
-	hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_DOUBLE,
+	hypre_MPI_Irecv( raddr[i]+rdone[i], rnum[i], hypre_MPI_REAL,
 		  rpes[i], TAG, pilut_comm, &receive_requests[ i ] );
 
 	rdone[i] += rnum[i] ;
@@ -225,7 +225,7 @@ void hypre_LDUSolve(DataDistType *ddist, FactorMatType *ldu, HYPRE_Real *x, HYPR
         for (j=auxsptr[i], l=0;   j<sptr[i+1] && sindex[j]>=nnodes[ii-1];   j++, l++) 
           gatherbuf[l] = ux[sindex[j]];
 
-	hypre_MPI_Send( gatherbuf, l, hypre_MPI_DOUBLE,
+	hypre_MPI_Send( gatherbuf, l, hypre_MPI_REAL,
 		  spes[i], TAG, pilut_comm );
 
         auxsptr[i] = j;
diff --git a/src/examples/README_files/ex12f.f.html b/src/examples/README_files/ex12f.f.html
index 5eae290..af72970 100644
--- a/src/examples/README_files/ex12f.f.html
+++ b/src/examples/README_files/ex12f.f.html
@@ -412,7 +412,7 @@
 <a name="line405">405</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGCreate</font><font color="4444FF">(</font><font color="#2040a0">MPI_COMM_WORLD</font>, <font color="#2040a0">precond</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
 <a name="line406">406</a> <font color="#444444">c        Set PFMG parameters</font>
 <a name="line407">407</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetMaxIter</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#FF0000">1</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
-<a name="line408">408</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetTol</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#FF0000">0.0</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
+<a name="line408">408</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetTol</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#FF0000">0.0d0</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
 <a name="line409">409</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetZeroGuess</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
 <a name="line410">410</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetNumPreRelax</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#FF0000">2</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
 <a name="line411">411</a>          <strong>call</strong> <font color="#2040a0">HYPRE_StructPFMGSetNumPostRelax</font><font color="4444FF">(</font><font color="#2040a0">precond</font>, <font color="#FF0000">2</font>, <font color="#2040a0">ierr</font><font color="4444FF">)</font>
diff --git a/src/examples/ex12f.f b/src/examples/ex12f.f
index f2240a9..74f200d 100644
--- a/src/examples/ex12f.f
+++ b/src/examples/ex12f.f
@@ -405,7 +405,7 @@ c        Create the Struct PFMG solver for use as a preconditioner
          call HYPRE_StructPFMGCreate(MPI_COMM_WORLD, precond, ierr)
 c        Set PFMG parameters
          call HYPRE_StructPFMGSetMaxIter(precond, 1, ierr)
-         call HYPRE_StructPFMGSetTol(precond, 0.0, ierr)
+         call HYPRE_StructPFMGSetTol(precond, 0.0d0, ierr)
          call HYPRE_StructPFMGSetZeroGuess(precond, ierr)
          call HYPRE_StructPFMGSetNumPreRelax(precond, 2, ierr)
          call HYPRE_StructPFMGSetNumPostRelax(precond, 2, ierr)
diff --git a/src/krylov/bicgstab.c b/src/krylov/bicgstab.c
index 471406b..4307b92 100644
--- a/src/krylov/bicgstab.c
+++ b/src/krylov/bicgstab.c
@@ -19,7 +19,6 @@
 #include "krylov.h"
 #include "_hypre_utilities.h"
 
-
 /*--------------------------------------------------------------------------
  * hypre_BiCGSTABFunctionsCreate
  *--------------------------------------------------------------------------*/
@@ -236,7 +235,7 @@ hypre_BiCGSTABSolve(void  *bicgstab_vdata,
    HYPRE_Int        iter; 
    HYPRE_Int        my_id, num_procs;
    HYPRE_Real alpha, beta, gamma, epsilon, temp, res, r_norm, b_norm;
-   HYPRE_Real epsmac = 1.e-128; 
+   HYPRE_Real epsmac = HYPRE_REAL_MIN; 
    HYPRE_Real ieee_check = 0.;
    HYPRE_Real cf_ave_0 = 0.0;
    HYPRE_Real cf_ave_1 = 0.0;
diff --git a/src/krylov/gmres.c b/src/krylov/gmres.c
index 3d29dc2..7cbadaf 100644
--- a/src/krylov/gmres.c
+++ b/src/krylov/gmres.c
@@ -833,7 +833,6 @@ hypre_GMRESSolve(void  *gmres_vdata,
       (gmres_data -> rel_residual_norm) = r_norm;
 
    if (iter >= max_iter && r_norm > epsilon) hypre_error(HYPRE_ERROR_CONV);
-   
 
    hypre_TFreeF(c,gmres_functions); 
    hypre_TFreeF(s,gmres_functions); 
diff --git a/src/krylov/lobpcg.c b/src/krylov/lobpcg.c
index 78deeb5..9c02e8c 100644
--- a/src/krylov/lobpcg.c
+++ b/src/krylov/lobpcg.c
@@ -176,7 +176,7 @@ HYPRE_Int* activeMask
   for ( i = 0; i < n; i++ ) {
     if ( utilities_FortranMatrixValue( resNorms, i + 1, 1 ) >
 	 utilities_FortranMatrixValue( lambda, i + 1, 1 )*rtol + atol
-	 + DBL_EPSILON ) {
+	 + HYPRE_REAL_EPSILON ) {
       activeMask[i] = 1; 
       notConverged++;
     }
diff --git a/src/krylov/pcg.c b/src/krylov/pcg.c
index f2e6634..b4ee27f 100644
--- a/src/krylov/pcg.c
+++ b/src/krylov/pcg.c
@@ -645,7 +645,7 @@ hypre_PCGSolve( void *pcg_vdata,
          break;
       }
 
-      if ( (gamma<1.0e-292) && ((-gamma)<1.0e-292) ) {
+      if ( (gamma<HYPRE_REAL_MIN) && ((-gamma)<HYPRE_REAL_MIN) ) {
          /* ierr = 1;*/
          hypre_error(HYPRE_ERROR_CONV);
          
@@ -668,7 +668,7 @@ hypre_PCGSolve( void *pcg_vdata,
       if (cf_tol > 0.0)
       {
          cf_ave_0 = cf_ave_1;
-         if ( i_prod_0<1.0e-292 ) {
+         if ( i_prod_0<HYPRE_REAL_MIN ) {
             /* i_prod_0 is zero, or (almost) subnormal, yet i_prod wasn't small
                enough to pass the convergence test.  Therefore initial guess was good,
                and we're just calculating garbage - time to bail out before the
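
The krylov changes swap hard-coded guards such as 1.e-128 and 1.0e-292 for HYPRE_REAL_MIN, so the "effectively zero" tests scale with the configured precision; in single precision no representable nonzero value lies below 1e-292, so the old test would have degenerated into an exact-zero check. A sketch of how such a limit can be taken from <float.h> (the guard names are assumed to mirror the HYPRE_SINGLE/HYPRE_LONG_DOUBLE convention used elsewhere in this patch):

    #include <float.h>

    /* Smallest positive normalized value of the configured HYPRE_Real. */
    #if defined(HYPRE_SINGLE)
    #define HYPRE_REAL_MIN FLT_MIN    /* ~1.2e-38  */
    #elif defined(HYPRE_LONG_DOUBLE)
    #define HYPRE_REAL_MIN LDBL_MIN
    #else
    #define HYPRE_REAL_MIN DBL_MIN    /* ~2.2e-308 */
    #endif
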
diff --git a/src/lapack/dbdsqr.c b/src/lapack/dbdsqr.c
index 3c4da16..1fc7a32 100644
--- a/src/lapack/dbdsqr.c
+++ b/src/lapack/dbdsqr.c
@@ -25,7 +25,7 @@ static doublereal c_b72 = -1.;
     doublereal d__1, d__2, d__3, d__4;
 
     /* Builtin functions */
-    HYPRE_Real pow_dd(doublereal *, doublereal *), sqrt(doublereal), d_sign(
+    HYPRE_Real pow_dd(doublereal *, doublereal *), d_sign(
 	    doublereal *, doublereal *);
 
     /* Local variables */
diff --git a/src/lapack/dgesvd.c b/src/lapack/dgesvd.c
index 5fd381f..490685b 100644
--- a/src/lapack/dgesvd.c
+++ b/src/lapack/dgesvd.c
@@ -30,7 +30,6 @@ static doublereal c_b438 = 1.;
 
     /* Builtin functions   
        Subroutine */ HYPRE_Int s_cat(char *, char **, integer *, integer *, ftnlen);
-    HYPRE_Real sqrt(doublereal);
 
     /* Local variables */
     static integer iscl;
diff --git a/src/lapack/dlabad.c b/src/lapack/dlabad.c
index b18aed0..b7df7f5 100644
--- a/src/lapack/dlabad.c
+++ b/src/lapack/dlabad.c
@@ -41,7 +41,7 @@
        If it looks like we're on a Cray, take the square root of   
        SMALL and LARGE to avoid overflow and underflow problems. */
     /* Builtin functions */
-    HYPRE_Real d_lg10(doublereal *), sqrt(doublereal);
+    HYPRE_Real d_lg10(doublereal *);
 
 
     if (d_lg10(large) > 2e3) {
diff --git a/src/lapack/dlae2.c b/src/lapack/dlae2.c
index 0028e99..f967de4 100644
--- a/src/lapack/dlae2.c
+++ b/src/lapack/dlae2.c
@@ -58,8 +58,6 @@
        Compute the eigenvalues */
     /* System generated locals */
     doublereal d__1;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static doublereal acmn, acmx, ab, df, tb, sm, rt, adf;
 
diff --git a/src/lapack/dlaev2.c b/src/lapack/dlaev2.c
index 1aa035b..c2d1bf1 100644
--- a/src/lapack/dlaev2.c
+++ b/src/lapack/dlaev2.c
@@ -69,8 +69,6 @@
        Compute the eigenvalues */
     /* System generated locals */
     doublereal d__1;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static doublereal acmn, acmx, ab, df, cs, ct, tb, sm, tn, rt, adf, acs;
     static integer sgn1, sgn2;
diff --git a/src/lapack/dlanst.c b/src/lapack/dlanst.c
index 0d5721f..706d782 100644
--- a/src/lapack/dlanst.c
+++ b/src/lapack/dlanst.c
@@ -62,8 +62,6 @@ doublereal dlanst_(const char *norm, integer *n, doublereal *d__, doublereal *e)
     /* System generated locals */
     integer i__1;
     doublereal ret_val, d__1, d__2, d__3, d__4, d__5;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static integer i__;
     static doublereal scale;
diff --git a/src/lapack/dlartg.c b/src/lapack/dlartg.c
index a98beec..5094f1c 100644
--- a/src/lapack/dlartg.c
+++ b/src/lapack/dlartg.c
@@ -61,17 +61,18 @@
     static integer count;
     static doublereal f1, g1, safmn2, safmx2;
     extern doublereal dlamch_(const char *);
-    static doublereal safmin, eps;
+/*  static doublereal safmin, eps;  */
 
 
 
     if (first) {
 	first = FALSE_;
-	safmin = dlamch_("S");
-	eps = dlamch_("E");
+/*	safmin = dlamch_("S");  */
+/*	eps = dlamch_("E");     */
 	d__1 = dlamch_("B");
-	i__1 = (integer) (log(safmin / eps) / log(dlamch_("B")) / 
-		2.);
+/*	i__1 = (integer) (log(safmin / eps) / log(dlamch_("B")) /
+		2.);  */
+        i__1 = HYPRE_REAL_MIN_EXP >> 1;
 	safmn2 = pow_di(&d__1, &i__1);
 	safmx2 = 1. / safmn2;
     }
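
The dlartg change above replaces the runtime computation of the scaling exponent with the compile-time constant HYPRE_REAL_MIN_EXP>>1, roughly half the minimum exponent of HYPRE_Real, so safmn2 = radix^(min_exp/2) no longer depends on dlamch_ probing safmin and eps. A quick check of the arithmetic for IEEE double (note that right-shifting a negative value is implementation-defined in C, though it is an arithmetic shift on the compilers hypre targets):

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
       /* dlartg rescales by safmn2 = radix^(min_exp/2) to avoid underflow.
        * For IEEE double: FLT_RADIX = 2, DBL_MIN_EXP = -1021. */
       int i1 = DBL_MIN_EXP >> 1;                       /* -511 */
       double safmn2 = pow((double)FLT_RADIX, (double)i1);
       double safmx2 = 1.0 / safmn2;
       printf("exp=%d safmn2=%g safmx2=%g\n", i1, safmn2, safmx2);
       return 0;
    }
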
diff --git a/src/lapack/dlas2.c b/src/lapack/dlas2.c
index 265dc4a..8a52d4e 100644
--- a/src/lapack/dlas2.c
+++ b/src/lapack/dlas2.c
@@ -60,8 +60,6 @@
     ==================================================================== */
     /* System generated locals */
     doublereal d__1, d__2;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static doublereal fhmn, fhmx, c__, fa, ga, ha, as, at, au;
 
diff --git a/src/lapack/dlasq1.c b/src/lapack/dlasq1.c
index c3ee476..6c6bc44 100644
--- a/src/lapack/dlasq1.c
+++ b/src/lapack/dlasq1.c
@@ -20,9 +20,6 @@ static integer c__0 = 0;
     integer i__1, i__2;
     doublereal d__1, d__2, d__3;
 
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
-
     /* Local variables */
     extern /* Subroutine */ HYPRE_Int dlas2_(doublereal *, doublereal *, doublereal 
 	    *, doublereal *, doublereal *);
diff --git a/src/lapack/dlasq2.c b/src/lapack/dlasq2.c
index dbf05e4..d3b8477 100644
--- a/src/lapack/dlasq2.c
+++ b/src/lapack/dlasq2.c
@@ -22,9 +22,6 @@ static integer c__11 = 11;
     integer i__1, i__2, i__3;
     doublereal d__1, d__2;
 
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
-
     /* Local variables */
     static logical ieee;
     static integer nbig;
diff --git a/src/lapack/dlasq3.c b/src/lapack/dlasq3.c
index cf0034c..80d7cd9 100644
--- a/src/lapack/dlasq3.c
+++ b/src/lapack/dlasq3.c
@@ -76,8 +76,6 @@
     /* System generated locals */
     integer i__1;
     doublereal d__1, d__2;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static doublereal temp, s, t;
     static integer j4;
diff --git a/src/lapack/dlasq4.c b/src/lapack/dlasq4.c
index e08e9e3..73a0521 100644
--- a/src/lapack/dlasq4.c
+++ b/src/lapack/dlasq4.c
@@ -20,9 +20,6 @@
     integer i__1;
     doublereal d__1, d__2;
 
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
-
     /* Local variables */
     static doublereal s, a2, b1, b2;
     static integer i4, nn, np;
diff --git a/src/lapack/dlasv2.c b/src/lapack/dlasv2.c
index d9016c6..04829f0 100644
--- a/src/lapack/dlasv2.c
+++ b/src/lapack/dlasv2.c
@@ -83,7 +83,7 @@
     /* System generated locals */
     doublereal d__1;
     /* Builtin functions */
-    HYPRE_Real sqrt(doublereal), d_sign(doublereal *, doublereal *);
+    HYPRE_Real d_sign(doublereal *, doublereal *);
     /* Local variables */
     static integer pmax;
     static doublereal temp;
diff --git a/src/lapack/dsteqr.c b/src/lapack/dsteqr.c
index 2107504..aee5c12 100644
--- a/src/lapack/dsteqr.c
+++ b/src/lapack/dsteqr.c
@@ -89,7 +89,7 @@
     integer z_dim1, z_offset, i__1, i__2;
     doublereal d__1, d__2;
     /* Builtin functions */
-    HYPRE_Real sqrt(doublereal), d_sign(doublereal *, doublereal *);
+    HYPRE_Real d_sign(doublereal *, doublereal *);
     /* Local variables */
     static integer lend, jtot;
     extern /* Subroutine */ HYPRE_Int dlae2_(doublereal *, doublereal *, doublereal 
diff --git a/src/lapack/dsterf.c b/src/lapack/dsterf.c
index 4a8e65a..0f76e42 100644
--- a/src/lapack/dsterf.c
+++ b/src/lapack/dsterf.c
@@ -54,7 +54,7 @@
     integer i__1;
     doublereal d__1, d__2, d__3;
     /* Builtin functions */
-    HYPRE_Real sqrt(doublereal), d_sign(doublereal *, doublereal *);
+    HYPRE_Real d_sign(doublereal *, doublereal *);
     /* Local variables */
     static doublereal oldc;
     static integer lend, jtot;
diff --git a/src/lapack/dsyev.c b/src/lapack/dsyev.c
index 93c7bcf..a51fa9a 100644
--- a/src/lapack/dsyev.c
+++ b/src/lapack/dsyev.c
@@ -85,8 +85,6 @@
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2;
     doublereal d__1;
-    /* Builtin functions */
-    HYPRE_Real sqrt(doublereal);
     /* Local variables */
     static integer inde;
     static doublereal anrm;
diff --git a/src/lapack/hypre_lapack.h b/src/lapack/hypre_lapack.h
index 48faad2..1a2ceff 100644
--- a/src/lapack/hypre_lapack.h
+++ b/src/lapack/hypre_lapack.h
@@ -22,6 +22,12 @@
 extern "C" {
 #endif
 
+#if defined(HYPRE_SINGLE)
+#define sqrt sqrtf
+#elif defined(HYPRE_LONG_DOUBLE)
+#define sqrt sqrtl
+#endif
+
 /* --------------------------------------------------------------------------
  *  Change all names to hypre_ to avoid link conflicts
  * --------------------------------------------------------------------------*/
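
Defining sqrt to sqrtf or sqrtl keeps the f2c-translated LAPACK sources untouched, but it textually rewrites every sqrt token in any file included after this header, which is why the patch also deletes the stray "HYPRE_Real sqrt(doublereal)" builtin declarations above: they would otherwise expand into conflicting prototypes. A narrower alternative, sketched here under the same precision guards (hypre_sqrt and hypre_real_t are assumed names, not hypre's API):

    #include <math.h>

    /* Sketch: precision-matched sqrt without redefining the libc name. */
    #if defined(HYPRE_SINGLE)
    typedef float hypre_real_t;
    static inline hypre_real_t hypre_sqrt(hypre_real_t x) { return sqrtf(x); }
    #elif defined(HYPRE_LONG_DOUBLE)
    typedef long double hypre_real_t;
    static inline hypre_real_t hypre_sqrt(hypre_real_t x) { return sqrtl(x); }
    #else
    typedef double hypre_real_t;
    static inline hypre_real_t hypre_sqrt(hypre_real_t x) { return sqrt(x); }
    #endif
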
diff --git a/src/multivector/backup.c b/src/multivector/backup.c
index 6b17c26..19228c9 100644
--- a/src/multivector/backup.c
+++ b/src/multivector/backup.c
@@ -132,10 +132,10 @@ hypre_TempMultiVectorSetRandom( void* v, HYPRE_Int seed ) {
 
   hypre_assert( data != NULL );
 
-  srand( seed );
+  hypre_SeedRand( seed );
   for ( i = 0; i < data->numVectors; i++ ) {
     if ( data->mask == NULL || (data->mask)[i] ) {
-      seed = rand();
+      seed = hypre_RandI();
       (data->interpreter->SetRandomValues)(data->vector[i],seed);
     }
   }
diff --git a/src/multivector/csr_matmultivec.c b/src/multivector/csr_matmultivec.c
index 78770b3..7dc8dbb 100644
--- a/src/multivector/csr_matmultivec.c
+++ b/src/multivector/csr_matmultivec.c
@@ -114,8 +114,8 @@ hypre_CSRMatrixMatMultivec(HYPRE_Complex alpha, hypre_CSRMatrix *A,
          temp = y_data[i];
          for (jj = A_i[i]; jj < A_i[i+1]; jj++)
             temp += A_data[jj] * x_data[A_j[jj]];
+         y_data[i] = temp;
       }
-      y_data[i] = temp;
    }
    else
    {
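
The csr_matmultivec change is a scoping fix: y_data[i] = temp; sat outside the row loop, so only the final row's accumulated value was ever written back. Reduced to a minimal CSR matvec, the corrected pattern looks like this (array roles follow the A_i/A_j/A_data naming in the hunk):

    /* y[i] += sum over row i of A_data[jj] * x[A_j[jj]]  (CSR storage) */
    void csr_matvec(int nrows, const int *A_i, const int *A_j,
                    const double *A_data, const double *x, double *y)
    {
       for (int i = 0; i < nrows; i++)
       {
          double temp = y[i];
          for (int jj = A_i[i]; jj < A_i[i + 1]; jj++)
          {
             temp += A_data[jj] * x[A_j[jj]];
          }
          y[i] = temp;   /* store inside the row loop; the bug was storing here
                            only once, after the loop over all rows */
       }
    }
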
diff --git a/src/parcsr_ls/F90_HYPRE_parcsr_amg.c b/src/parcsr_ls/F90_HYPRE_parcsr_amg.c
index 3ba55a9..dcd8909 100644
--- a/src/parcsr_ls/F90_HYPRE_parcsr_amg.c
+++ b/src/parcsr_ls/F90_HYPRE_parcsr_amg.c
@@ -1743,6 +1743,54 @@ hypre_F90_IFACE(hypre_boomeramgsetchebyfract, HYPRE_BOOMERAMGSETCHEBYFRACT)
 }
 
 /*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetChebyScale
+ *--------------------------------------------------------------------------*/
+
+void
+hypre_F90_IFACE(hypre_boomeramgsetchebyscale, HYPRE_BOOMERAMGSETCHEBYSCALE)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *cheby_scale,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGSetChebyScale(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassInt (cheby_scale) ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetChebyVariant
+ *--------------------------------------------------------------------------*/
+
+void
+hypre_F90_IFACE(hypre_boomeramgsetchebyvariant, HYPRE_BOOMERAMGSETCHEBYVARIANT)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *cheby_variant,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGSetChebyVariant(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassInt (cheby_variant) ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetChebyEigEst
+ *--------------------------------------------------------------------------*/
+
+void
+hypre_F90_IFACE(hypre_boomeramgsetchebyeigest, HYPRE_BOOMERAMGSETCHEBYEIGEST)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *cheby_eig_est,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGSetChebyEigEst(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassInt (cheby_eig_est) ) );
+}
+
+/*--------------------------------------------------------------------------
  * HYPRE_BoomerAMGSetKeepTranspose
  *--------------------------------------------------------------------------*/
 
@@ -1775,7 +1823,7 @@ hypre_F90_IFACE(hypre_boomeramgsetrap2, HYPRE_BOOMERAMGSETRAP2)
 }
 
 /*--------------------------------------------------------------------------
- * HYPRE_BoomerAMGSetAdditive
+ * HYPRE_BoomerAMGSetAdditive, HYPRE_BoomerAMGGetAdditive
  *--------------------------------------------------------------------------*/
 
 void
@@ -1790,8 +1838,20 @@ hypre_F90_IFACE(hypre_boomeramgsetadditive, HYPRE_BOOMERAMGSETADDITIVE)
            hypre_F90_PassInt (add_lvl) ) );
 }
 
+void
+hypre_F90_IFACE(hypre_boomeramggetadditive, HYPRE_BOOMERAMGGETADDITIVE)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *add_lvl,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGGetAdditive(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassIntRef (add_lvl) ) );
+}
+
 /*--------------------------------------------------------------------------
- * HYPRE_BoomerAMGSetMultAdditive
+ * HYPRE_BoomerAMGSetMultAdditive, HYPRE_BoomerAMGGetMultAdditive
  *--------------------------------------------------------------------------*/
 
 void
@@ -1806,8 +1866,20 @@ hypre_F90_IFACE(hypre_boomeramgsetmultadd, HYPRE_BOOMERAMGSETMULTADD)
            hypre_F90_PassInt (add_lvl) ) );
 }
 
+void
+hypre_F90_IFACE(hypre_boomeramggetmultadd, HYPRE_BOOMERAMGGETMULTADD)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *add_lvl,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGGetMultAdditive(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassIntRef (add_lvl) ) );
+}
+
 /*--------------------------------------------------------------------------
- * HYPRE_BoomerAMGSetSimple
+ * HYPRE_BoomerAMGSetSimple, HYPRE_BoomerAMGGetSimple
  *--------------------------------------------------------------------------*/
 
 void
@@ -1822,6 +1894,34 @@ hypre_F90_IFACE(hypre_boomeramgsetsimple, HYPRE_BOOMERAMGSETSIMPLE)
            hypre_F90_PassInt (add_lvl) ) );
 }
 
+void
+hypre_F90_IFACE(hypre_boomeramggetsimple, HYPRE_BOOMERAMGGETSIMPLE)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *add_lvl,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGGetSimple(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassIntRef (add_lvl) ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetAddLastLvl
+ *--------------------------------------------------------------------------*/
+
+void
+hypre_F90_IFACE(hypre_boomeramgsetaddlastlvl, HYPRE_BOOMERAMGSETADDLASTLVL)
+   ( hypre_F90_Obj *solver,
+     hypre_F90_Int *add_last_lvl,
+     hypre_F90_Int *ierr          )
+{
+   *ierr = (hypre_F90_Int)
+      ( HYPRE_BoomerAMGSetAddLastLvl(
+           hypre_F90_PassObj (HYPRE_Solver, solver),
+           hypre_F90_PassInt (add_last_lvl) ) );
+}
+
 /*--------------------------------------------------------------------------
  * HYPRE_BoomerAMGSetMultAddTruncFactor
  *--------------------------------------------------------------------------*/
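
The wrappers above follow hypre's usual Fortran-interface convention: every
argument, including the error code, arrives by reference, and the
hypre_F90_PassObj / hypre_F90_PassInt macros unwrap those references before
forwarding to the C API. A minimal sketch of how one of the new wrappers is
driven (handle creation elided; the parameter value is illustrative, not a
statement about library defaults):

    /* Sketch only: solver is assumed to hold a handle created earlier,
       e.g. through the Fortran interface to HYPRE_BoomerAMGCreate. */
    hypre_F90_Obj solver;            /* opaque handle wrapping a HYPRE_Solver */
    hypre_F90_Int cheby_scale = 1;   /* 1 = scale the matrix */
    hypre_F90_Int ierr;

    hypre_F90_IFACE(hypre_boomeramgsetchebyscale, HYPRE_BOOMERAMGSETCHEBYSCALE)
       (&solver, &cheby_scale, &ierr);
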
diff --git a/src/parcsr_ls/HYPRE_parcsr_amg.c b/src/parcsr_ls/HYPRE_parcsr_amg.c
index 2a4dbd7..d6733f8 100644
--- a/src/parcsr_ls/HYPRE_parcsr_amg.c
+++ b/src/parcsr_ls/HYPRE_parcsr_amg.c
@@ -1466,7 +1466,7 @@ HYPRE_BoomerAMGSetChebyOrder( HYPRE_Solver  solver,
    return( hypre_BoomerAMGSetChebyOrder( (void *) solver, order ) );
 }
 /*--------------------------------------------------------------------------
- * HYPRE_BoomerAMGSetChebyEigRatio
+ * HYPRE_BoomerAMGSetChebyFraction
  *--------------------------------------------------------------------------*/
 
 HYPRE_Int
@@ -1475,10 +1475,43 @@ HYPRE_BoomerAMGSetChebyFraction( HYPRE_Solver  solver,
 {
    return( hypre_BoomerAMGSetChebyFraction( (void *) solver, ratio ) );
 }
+
 /*--------------------------------------------------------------------------
- * HYPRE_BoomerAMGSetInterpVectors
+ * HYPRE_BoomerAMGSetChebyScale
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+HYPRE_BoomerAMGSetChebyScale( HYPRE_Solver  solver,
+                                 HYPRE_Int     scale )
+{
+   return( hypre_BoomerAMGSetChebyScale( (void *) solver, scale ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetChebyVariant
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+HYPRE_BoomerAMGSetChebyVariant( HYPRE_Solver  solver,
+                                 HYPRE_Int     variant )
+{
+   return( hypre_BoomerAMGSetChebyVariant( (void *) solver, variant ) );
+}
+
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetChebyEigEst
  *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+HYPRE_BoomerAMGSetChebyEigEst( HYPRE_Solver  solver,
+                                 HYPRE_Int     eig_est )
+{
+   return( hypre_BoomerAMGSetChebyEigEst( (void *) solver, eig_est ) );
+}
 
+/*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetInterpVectors
+ *--------------------------------------------------------------------------*/
 HYPRE_Int
 HYPRE_BoomerAMGSetInterpVectors (HYPRE_Solver solver, HYPRE_Int num_vectors,
                                  HYPRE_ParVector *vectors)
@@ -1604,6 +1637,17 @@ HYPRE_BoomerAMGGetSimple( HYPRE_Solver solver,
 }
 
 /*--------------------------------------------------------------------------
+ * HYPRE_BoomerAMGSetAddLastLvl
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+HYPRE_BoomerAMGSetAddLastLvl( HYPRE_Solver solver,
+                          HYPRE_Int          add_last_lvl  )
+{
+   return( hypre_BoomerAMGSetAddLastLvl( (void *) solver, add_last_lvl ) );
+}
+
+/*--------------------------------------------------------------------------
  * HYPRE_BoomerAMGSetNonGalerkinTol
  *--------------------------------------------------------------------------*/
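
Taken together, the new public setters are used like any other BoomerAMG
option. A minimal C sketch (the values shown restate the defaults installed
by hypre_BoomerAMGCreate, except add_last_lvl, which defaults to -1, i.e.
disabled):

    HYPRE_Solver solver;
    HYPRE_BoomerAMGCreate(&solver);

    /* Chebyshev smoother options introduced above */
    HYPRE_BoomerAMGSetChebyScale(solver, 1);    /* 1 = scaled (default)           */
    HYPRE_BoomerAMGSetChebyVariant(solver, 0);  /* 0 = standard variant (default) */
    HYPRE_BoomerAMGSetChebyEigEst(solver, 10);  /* 10 CG iterations (default)     */

    /* Last level on which the additive/mult-additive/simple cycle is used;
       only meaningful when AMG is applied as a preconditioner. */
    HYPRE_BoomerAMGSetAddLastLvl(solver, 2);    /* 2 is illustrative */
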
 
diff --git a/src/parcsr_ls/HYPRE_parcsr_ls.h b/src/parcsr_ls/HYPRE_parcsr_ls.h
index c5e35c2..41ea7ec 100644
--- a/src/parcsr_ls/HYPRE_parcsr_ls.h
+++ b/src/parcsr_ls/HYPRE_parcsr_ls.h
@@ -582,6 +582,16 @@ HYPRE_Int HYPRE_BoomerAMGSetSimple(HYPRE_Solver solver,
                                    HYPRE_Int    addlvl);
 
 /**
+ * (Optional) Defines the last level on which the additive, mult-additive,
+ * or simple cycle is used; the multiplicative approach is used on levels
+ * with index greater than add_last_lvl.
+ *
+ * Note: this option can only be used when AMG is employed as a preconditioner.
+ **/
+HYPRE_Int HYPRE_BoomerAMGSetAddLastLvl(HYPRE_Solver solver,
+                                     HYPRE_Int    add_last_lvl);
+
+/**
  * (Optional) Defines the truncation factor for the 
  * smoothed interpolation used for mult-additive or simple method.
  * The default is 0.
@@ -838,6 +848,31 @@ HYPRE_Int HYPRE_BoomerAMGSetChebyOrder(HYPRE_Solver solver,
 HYPRE_Int HYPRE_BoomerAMGSetChebyFraction (HYPRE_Solver solver,
                                            HYPRE_Real   ratio);
 
+/**
+ * (Optional) Defines whether the matrix should be scaled.
+ * The default is 1 (i.e., scaled).
+ **/
+HYPRE_Int HYPRE_BoomerAMGSetChebyScale (HYPRE_Solver solver,
+                                           HYPRE_Int   scale);
+
+/**
+ * (Optional) Defines which polynomial variant should be used.
+ * The default is 0 (i.e., the standard variant).
+ **/
+HYPRE_Int HYPRE_BoomerAMGSetChebyVariant (HYPRE_Solver solver,
+                                           HYPRE_Int   variant);
+
+/**
+ * (Optional) Defines how the eigenvalues are estimated.
+ * The default is 10, i.e., 10 CG iterations are used to find the extreme
+ * eigenvalues. If eig_est = 0, the largest eigenvalue is estimated using
+ * Gershgorin's theorem and the smallest is set to 0. If eig_est is a
+ * positive number n, n CG iterations are used to determine the smallest
+ * and largest eigenvalues.
+ **/
+HYPRE_Int HYPRE_BoomerAMGSetChebyEigEst (HYPRE_Solver solver,
+                                           HYPRE_Int   eig_est);
+
 /**
  * (Optional) Enables the use of more complex smoothers.
  * The following options exist for smooth\_type:
@@ -3129,6 +3164,21 @@ GenerateVarDifConv(MPI_Comm         comm,
                    HYPRE_Real       eps,
                    HYPRE_ParVector *rhs_ptr);
 
+HYPRE_ParCSRMatrix
+GenerateRSVarDifConv(MPI_Comm         comm,
+                     HYPRE_Int        nx,
+                     HYPRE_Int        ny,
+                     HYPRE_Int        nz,
+                     HYPRE_Int        P,
+                     HYPRE_Int        Q,
+                     HYPRE_Int        R,
+                     HYPRE_Int        p,
+                     HYPRE_Int        q,
+                     HYPRE_Int        r,
+                     HYPRE_Real       eps,
+                     HYPRE_ParVector *rhs_ptr,
+                     HYPRE_Int        type);
+
 float*
 GenerateCoordinates(MPI_Comm  comm,
                     HYPRE_Int nx,
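
GenerateRSVarDifConv mirrors the existing GenerateVarDifConv test-problem
generator, with a trailing `type` argument that selects among the rs-variant
coefficient/right-hand-side functions defined in par_vardifconv_rs.c. A
driver-style sketch (the (p,q,r) decomposition follows the convention used in
hypre's test drivers and assumes exactly P*Q*R MPI ranks; all grid sizes are
illustrative):

    HYPRE_Int myid, p, q, r;
    HYPRE_Int P = 2, Q = 2, R = 1;                 /* process grid */
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid);
    p = myid % P;
    q = (myid / P) % Q;
    r = myid / (P * Q);

    HYPRE_ParVector rhs;
    HYPRE_ParCSRMatrix A =
       GenerateRSVarDifConv(hypre_MPI_COMM_WORLD,
                            40, 40, 40,            /* global grid nx,ny,nz */
                            P, Q, R, p, q, r,
                            0.1,                   /* eps */
                            &rhs, 1);              /* type (assumed variant) */
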
diff --git a/src/parcsr_ls/Makefile b/src/parcsr_ls/Makefile
index 2ef785e..c1508d7 100644
--- a/src/parcsr_ls/Makefile
+++ b/src/parcsr_ls/Makefile
@@ -86,6 +86,7 @@ FILES =\
  par_cg_relax_wt.c\
  par_coarsen.c\
  par_cgc_coarsen.c\
+ par_cheby.c\
  par_coarse_parms.c\
  par_coordinates.c\
  par_cr.c\
@@ -107,6 +108,7 @@ FILES =\
  par_rap_communication.c\
  par_rotate_7pt.c\
  par_vardifconv.c\
+ par_vardifconv_rs.c\
  par_relax.c\
  par_relax_more.c\
  par_relax_interface.c\
diff --git a/src/parcsr_ls/_hypre_parcsr_ls.h b/src/parcsr_ls/_hypre_parcsr_ls.h
index 22807d1..d2ddf36 100644
--- a/src/parcsr_ls/_hypre_parcsr_ls.h
+++ b/src/parcsr_ls/_hypre_parcsr_ls.h
@@ -167,8 +167,13 @@ typedef struct
 
    HYPRE_Real          *max_eig_est;
    HYPRE_Real          *min_eig_est;
-   HYPRE_Int                  cheby_order;
+   HYPRE_Int           cheby_eig_est;
+   HYPRE_Int            cheby_order;
+   HYPRE_Int           cheby_variant;
+   HYPRE_Int           cheby_scale;
    HYPRE_Real           cheby_fraction;
+   HYPRE_Real         **cheby_ds;
+   HYPRE_Real         **cheby_coefs;
 
    /* data needed for non-Galerkin option */
    HYPRE_Int           nongalerk_num_tol;
@@ -240,11 +245,13 @@ typedef struct
    HYPRE_Int      additive;
    HYPRE_Int      mult_additive;
    HYPRE_Int      simple;
+   HYPRE_Int      add_last_lvl;
    HYPRE_Int      add_P_max_elmts;
    HYPRE_Real     add_trunc_factor;
    HYPRE_Int      add_rlx_type;
    HYPRE_Real     add_rlx_wt;
    hypre_ParCSRMatrix *Lambda;
+   hypre_ParCSRMatrix *Atilde;
    hypre_ParVector *Rtilde;
    hypre_ParVector *Xtilde;
    HYPRE_Real *D_inv;
@@ -363,7 +370,11 @@ typedef struct
 #define hypre_ParAMGDataMinEigEst(amg_data) ((amg_data)->min_eig_est)	
 #define hypre_ParAMGDataChebyOrder(amg_data) ((amg_data)->cheby_order)
 #define hypre_ParAMGDataChebyFraction(amg_data) ((amg_data)->cheby_fraction)
-
+#define hypre_ParAMGDataChebyEigEst(amg_data) ((amg_data)->cheby_eig_est)
+#define hypre_ParAMGDataChebyVariant(amg_data) ((amg_data)->cheby_variant)
+#define hypre_ParAMGDataChebyScale(amg_data) ((amg_data)->cheby_scale)
+#define hypre_ParAMGDataChebyDS(amg_data) ((amg_data)->cheby_ds)
+#define hypre_ParAMGDataChebyCoefs(amg_data) ((amg_data)->cheby_coefs)
 
 /* block */
 #define hypre_ParAMGDataABlockArray(amg_data) ((amg_data)->A_block_array)
@@ -437,11 +448,13 @@ typedef struct
 #define hypre_ParAMGDataAdditive(amg_data) ((amg_data)->additive)
 #define hypre_ParAMGDataMultAdditive(amg_data) ((amg_data)->mult_additive)
 #define hypre_ParAMGDataSimple(amg_data) ((amg_data)->simple)
+#define hypre_ParAMGDataAddLastLvl(amg_data) ((amg_data)->add_last_lvl)
 #define hypre_ParAMGDataMultAddPMaxElmts(amg_data) ((amg_data)->add_P_max_elmts)
 #define hypre_ParAMGDataMultAddTruncFactor(amg_data) ((amg_data)->add_trunc_factor)
 #define hypre_ParAMGDataAddRelaxType(amg_data) ((amg_data)->add_rlx_type)
 #define hypre_ParAMGDataAddRelaxWt(amg_data) ((amg_data)->add_rlx_wt)
 #define hypre_ParAMGDataLambda(amg_data) ((amg_data)->Lambda)
+#define hypre_ParAMGDataAtilde(amg_data) ((amg_data)->Atilde)
 #define hypre_ParAMGDataRtilde(amg_data) ((amg_data)->Rtilde)
 #define hypre_ParAMGDataXtilde(amg_data) ((amg_data)->Xtilde)
 #define hypre_ParAMGDataDinv(amg_data) ((amg_data)->D_inv)
@@ -839,6 +852,9 @@ HYPRE_Int HYPRE_BoomerAMGSetCoordDim ( HYPRE_Solver solver , HYPRE_Int coorddim
 HYPRE_Int HYPRE_BoomerAMGSetCoordinates ( HYPRE_Solver solver , float *coordinates );
 HYPRE_Int HYPRE_BoomerAMGSetChebyOrder ( HYPRE_Solver solver , HYPRE_Int order );
 HYPRE_Int HYPRE_BoomerAMGSetChebyFraction ( HYPRE_Solver solver , HYPRE_Real ratio );
+HYPRE_Int HYPRE_BoomerAMGSetChebyEigEst ( HYPRE_Solver solver , HYPRE_Int eig_est );
+HYPRE_Int HYPRE_BoomerAMGSetChebyVariant ( HYPRE_Solver solver , HYPRE_Int variant );
+HYPRE_Int HYPRE_BoomerAMGSetChebyScale ( HYPRE_Solver solver , HYPRE_Int scale );
 HYPRE_Int HYPRE_BoomerAMGSetInterpVectors ( HYPRE_Solver solver , HYPRE_Int num_vectors , HYPRE_ParVector *vectors );
 HYPRE_Int HYPRE_BoomerAMGSetInterpVecVariant ( HYPRE_Solver solver , HYPRE_Int num );
 HYPRE_Int HYPRE_BoomerAMGSetInterpVecQMax ( HYPRE_Solver solver , HYPRE_Int q_max );
@@ -852,6 +868,7 @@ HYPRE_Int HYPRE_BoomerAMGSetMultAdditive ( HYPRE_Solver solver , HYPRE_Int mult_
 HYPRE_Int HYPRE_BoomerAMGGetMultAdditive ( HYPRE_Solver solver , HYPRE_Int *mult_additive );
 HYPRE_Int HYPRE_BoomerAMGSetSimple ( HYPRE_Solver solver , HYPRE_Int simple );
 HYPRE_Int HYPRE_BoomerAMGGetSimple ( HYPRE_Solver solver , HYPRE_Int *simple );
+HYPRE_Int HYPRE_BoomerAMGSetAddLastLvl ( HYPRE_Solver solver , HYPRE_Int add_last_lvl );
 HYPRE_Int HYPRE_BoomerAMGSetNonGalerkinTol ( HYPRE_Solver solver , HYPRE_Real nongalerkin_tol );
 HYPRE_Int HYPRE_BoomerAMGSetLevelNonGalerkinTol ( HYPRE_Solver solver , HYPRE_Real nongalerkin_tol , HYPRE_Int level );
 HYPRE_Int HYPRE_BoomerAMGSetNonGalerkTol ( HYPRE_Solver solver , HYPRE_Int nongalerk_num_tol , HYPRE_Real *nongalerk_tol );
@@ -1261,6 +1278,9 @@ HYPRE_Int hypre_BoomerAMGSetEuSparseA ( void *data , HYPRE_Real eu_sparse_A );
 HYPRE_Int hypre_BoomerAMGSetEuBJ ( void *data , HYPRE_Int eu_bj );
 HYPRE_Int hypre_BoomerAMGSetChebyOrder ( void *data , HYPRE_Int order );
 HYPRE_Int hypre_BoomerAMGSetChebyFraction ( void *data , HYPRE_Real ratio );
+HYPRE_Int hypre_BoomerAMGSetChebyEigEst ( void *data , HYPRE_Int eig_est );
+HYPRE_Int hypre_BoomerAMGSetChebyVariant ( void *data , HYPRE_Int variant );
+HYPRE_Int hypre_BoomerAMGSetChebyScale ( void *data , HYPRE_Int scale );
 HYPRE_Int hypre_BoomerAMGSetInterpVectors ( void *solver , HYPRE_Int num_vectors , hypre_ParVector **interp_vectors );
 HYPRE_Int hypre_BoomerAMGSetInterpVecVariant ( void *solver , HYPRE_Int var );
 HYPRE_Int hypre_BoomerAMGSetInterpVecQMax ( void *data , HYPRE_Int q_max );
@@ -1274,6 +1294,7 @@ HYPRE_Int hypre_BoomerAMGSetMultAdditive ( void *data , HYPRE_Int mult_additive
 HYPRE_Int hypre_BoomerAMGGetMultAdditive ( void *data , HYPRE_Int *mult_additive );
 HYPRE_Int hypre_BoomerAMGSetSimple ( void *data , HYPRE_Int simple );
 HYPRE_Int hypre_BoomerAMGGetSimple ( void *data , HYPRE_Int *simple );
+HYPRE_Int hypre_BoomerAMGSetAddLastLvl ( void *data , HYPRE_Int add_last_lvl );
 HYPRE_Int hypre_BoomerAMGSetNonGalerkinTol ( void *data , HYPRE_Real nongalerkin_tol );
 HYPRE_Int hypre_BoomerAMGSetLevelNonGalerkinTol ( void *data , HYPRE_Real nongalerkin_tol , HYPRE_Int level );
 HYPRE_Int hypre_BoomerAMGSetNonGalerkTol ( void *data , HYPRE_Int nongalerk_num_tol , HYPRE_Real *nongalerk_tol );
@@ -1303,6 +1324,10 @@ HYPRE_Int hypre_AmgCGCBoundaryFix ( hypre_ParCSRMatrix *S , HYPRE_Int *CF_marker
 HYPRE_Int hypre_BoomerAMGCGRelaxWt ( void *amg_vdata , HYPRE_Int level , HYPRE_Int num_cg_sweeps , HYPRE_Real *rlx_wt_ptr );
 HYPRE_Int hypre_Bisection ( HYPRE_Int n , HYPRE_Real *diag , HYPRE_Real *offd , HYPRE_Real y , HYPRE_Real z , HYPRE_Real tol , HYPRE_Int k , HYPRE_Real *ev_ptr );
 
+/* par_cheby.c */
+HYPRE_Int hypre_ParCSRRelax_Cheby_Setup ( hypre_ParCSRMatrix *A , HYPRE_Real max_eig , HYPRE_Real min_eig , HYPRE_Real fraction , HYPRE_Int order , HYPRE_Int scale , HYPRE_Int variant , HYPRE_Real **coefs_ptr , HYPRE_Real **ds_ptr );
+HYPRE_Int hypre_ParCSRRelax_Cheby_Solve ( hypre_ParCSRMatrix *A , hypre_ParVector *f , HYPRE_Real *ds_data , HYPRE_Real *coefs , HYPRE_Int order , HYPRE_Int scale , HYPRE_Int variant , hypre_ParVector *u , hypre_ParVector *v , hypre_ParVector *r );
+
 /* par_coarsen.c */
 HYPRE_Int hypre_BoomerAMGCoarsen ( hypre_ParCSRMatrix *S , hypre_ParCSRMatrix *A , HYPRE_Int CF_init , HYPRE_Int debug_flag , HYPRE_Int **CF_marker_ptr );
 HYPRE_Int hypre_BoomerAMGCoarsenRuge ( hypre_ParCSRMatrix *S , hypre_ParCSRMatrix *A , HYPRE_Int measure_type , HYPRE_Int coarsen_type , HYPRE_Int debug_flag , HYPRE_Int **CF_marker_ptr );
@@ -1430,7 +1455,7 @@ HYPRE_Int hypre_GenerateSendMapAndCommPkg ( MPI_Comm comm , HYPRE_Int num_sends
 HYPRE_Int hypre_BoomerAMGRelax ( hypre_ParCSRMatrix *A , hypre_ParVector *f , HYPRE_Int *cf_marker , HYPRE_Int relax_type , HYPRE_Int relax_points , HYPRE_Real relax_weight , HYPRE_Real omega , HYPRE_Real *l1_norms , hypre_ParVector *u , hypre_ParVector *Vtemp , hypre_ParVector *Ztemp );
 HYPRE_Int hypre_GaussElimSetup ( hypre_ParAMGData *amg_data , HYPRE_Int level , HYPRE_Int relax_type );
 HYPRE_Int hypre_GaussElimSolve ( hypre_ParAMGData *amg_data , HYPRE_Int level , HYPRE_Int relax_type );
-HYPRE_Int gselim ( HYPRE_Real *A , HYPRE_Real *x , HYPRE_Int n );
+HYPRE_CUDA_GLOBAL HYPRE_Int gselim ( HYPRE_Real *A , HYPRE_Real *x , HYPRE_Int n );
 
 /* par_relax_interface.c */
 HYPRE_Int hypre_BoomerAMGRelaxIF ( hypre_ParCSRMatrix *A , hypre_ParVector *f , HYPRE_Int *cf_marker , HYPRE_Int relax_type , HYPRE_Int relax_order , HYPRE_Int cycle_type , HYPRE_Real relax_weight , HYPRE_Real omega , HYPRE_Real *l1_norms , hypre_ParVector *u , hypre_ParVector *Vtemp , hypre_ParVector *Ztemp );
@@ -1509,6 +1534,19 @@ HYPRE_Real gfun ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
 HYPRE_Real rfun ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
 HYPRE_Real bndfun ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
 
+/* par_vardifconv_rs.c */
+HYPRE_ParCSRMatrix GenerateRSVarDifConv ( MPI_Comm comm , HYPRE_Int nx , HYPRE_Int ny , HYPRE_Int nz , HYPRE_Int P , HYPRE_Int Q , HYPRE_Int R , HYPRE_Int p , HYPRE_Int q , HYPRE_Int r , HYPRE_Real eps , HYPRE_ParVector *rhs_ptr, HYPRE_Int type );
+HYPRE_Real afun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real bfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real cfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real dfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real efun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real ffun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real gfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real rfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+HYPRE_Real bndfun_rs ( HYPRE_Real xx , HYPRE_Real yy , HYPRE_Real zz );
+
+
 /* pcg_par.c */
 char *hypre_ParKrylovCAlloc ( HYPRE_Int count , HYPRE_Int elt_size );
 HYPRE_Int hypre_ParKrylovFree ( char *ptr );
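
The two par_cheby.c entry points split Chebyshev relaxation into a one-time
setup and a per-sweep solve, which is what allows the cheby_coefs/cheby_ds
arrays introduced above to be cached per level. A sketch of the intended call
pattern (A, f, u and the work vectors v, r are assumed to exist, as are the
eigenvalue estimates max_eig and min_eig; fraction 0.3 and order 2 match the
BoomerAMG defaults):

    HYPRE_Real *coefs = NULL, *ds = NULL;

    /* once per level: polynomial coefficients and diagonal scaling */
    hypre_ParCSRRelax_Cheby_Setup(A, max_eig, min_eig,
                                  0.3 /* fraction */, 2 /* order */,
                                  1 /* scale */, 0 /* variant */,
                                  &coefs, &ds);

    /* every smoothing sweep: reuse the cached data */
    hypre_ParCSRRelax_Cheby_Solve(A, f, ds, coefs,
                                  2 /* order */, 1 /* scale */, 0 /* variant */,
                                  u, v, r);
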
diff --git a/src/parcsr_ls/ams.c b/src/parcsr_ls/ams.c
index 95a9229..458292e 100644
--- a/src/parcsr_ls/ams.c
+++ b/src/parcsr_ls/ams.c
@@ -10,10 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
-
-
 #include "_hypre_parcsr_ls.h"
 #include "float.h"
 #include "ams.h"
@@ -24,7 +20,7 @@
  * Relaxation on the ParCSR matrix A with right-hand side f and
  * initial guess u. Possible values for relax_type are:
  *
- * 1 = l1-scaled Jacobi
+ * 1 = l1-scaled (or weighted) Jacobi
  * 2 = l1-scaled block Gauss-Seidel/SSOR
  * 3 = Kaczmarz
  * 4 = truncated version of 2 (Remark 6.2 in smoothers paper)
@@ -70,14 +66,27 @@ HYPRE_Int hypre_ParCSRRelax(/* matrix to relax with */
    {
       if (relax_type == 1) /* l1-scaled Jacobi */
       {
-         HYPRE_Int i, num_rows = hypre_ParCSRMatrixNumRows(A);
-
+         PUSH_RANGE_PAYLOAD("RELAX",4,sweep);
+         HYPRE_Int i, num_rows = hypre_ParCSRMatrixNumRows(A);
+#ifdef HYPRE_USE_GPU
+         if (sweep==0){
+            hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(v));
+            hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(f));
+         }
+         VecCopy(v_data,f_data,hypre_VectorSize(hypre_ParVectorLocalVector(v)),HYPRE_STREAM(4));
+#else
          hypre_ParVectorCopy(f,v);
+#endif
          hypre_ParCSRMatrixMatvec(-relax_weight, A, u, relax_weight, v);
-
+#ifdef HYPRE_USE_GPU
+         VecScale(u_data,v_data,l1_norms,num_rows,HYPRE_STREAM(4));
+#else
          /* u += w D^{-1}(f - A u), where D_ii = ||A(i,:)||_1 */
          for (i = 0; i < num_rows; i++)
             u_data[i] += v_data[i] / l1_norms[i];
+#endif
+         POP_RANGE;
       }
       else if (relax_type == 2 || relax_type == 4) /* offd-l1-scaled block GS */
       {
@@ -704,6 +713,18 @@ HYPRE_Int hypre_ParCSRComputeL1Norms(hypre_ParCSRMatrix *A,
             l1_norm[i] = diag;
       }
    }
+   else if (option == 5) /* stores diagonal of A for Jacobi using matvec, rlx 7 */
+   {
+      for (i = 0; i < num_rows; i++)
+      {
+         diag = A_diag_data[A_diag_I[i]];
+         if (diag != 0.0) l1_norm[i] = diag;
+         else l1_norm[i] = 1.0;
+      }
+      *l1_norm_ptr = l1_norm;
+
+      return hypre_error_flag;
+   }
 
    /* Handle negative definite matrices */
    for (i = 0; i < num_rows; i++)
@@ -718,6 +739,7 @@ HYPRE_Int hypre_ParCSRComputeL1Norms(hypre_ParCSRMatrix *A,
          break;
       }
 
+   //for (i = 0; i < num_rows; i++) l1_norm[i]=1.0/l1_norm[i];
    hypre_TFree(cf_marker_offd);
 
    *l1_norm_ptr = l1_norm;
@@ -1077,7 +1099,7 @@ HYPRE_Int hypre_AMSSetAlphaPoissonMatrix(void *solver,
    ams_data -> A_Pi = A_Pi;
 
    /* Penalize the eliminated degrees of freedom */
-   hypre_ParCSRMatrixSetDiagRows(A_Pi, DBL_MAX);
+   hypre_ParCSRMatrixSetDiagRows(A_Pi, HYPRE_REAL_MAX);
 
    /* Make sure that the first entry in each row is the diagonal one. */
    /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(A_Pi)); */
@@ -1106,7 +1128,7 @@ HYPRE_Int hypre_AMSSetBetaPoissonMatrix(void *solver,
    else
    {
       /* Penalize the eliminated degrees of freedom */
-      hypre_ParCSRMatrixSetDiagRows(A_G, DBL_MAX);
+      hypre_ParCSRMatrixSetDiagRows(A_G, HYPRE_REAL_MAX);
 
       /* Make sure that the first entry in each row is the diagonal one. */
       /* hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(A_G)); */
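
For reference, the l1-Jacobi sweep that the GPU and CPU branches above both
implement is, written out in one place (this restates the patched code, with
v as the work vector and l1_norms precomputed by hypre_ParCSRComputeL1Norms):

    /* one sweep of l1-scaled Jacobi:  u += w D^{-1} (f - A u),
       with D_ii = ||A(i,:)||_1 */
    hypre_ParVectorCopy(f, v);                                       /* v = f          */
    hypre_ParCSRMatrixMatvec(-relax_weight, A, u, relax_weight, v);  /* v = w (f - Au) */
    for (i = 0; i < num_rows; i++)
       u_data[i] += v_data[i] / l1_norms[i];                         /* u += D^{-1} v  */
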
diff --git a/src/parcsr_ls/gen_redcs_mat.c b/src/parcsr_ls/gen_redcs_mat.c
index d058a63..7995c11 100644
--- a/src/parcsr_ls/gen_redcs_mat.c
+++ b/src/parcsr_ls/gen_redcs_mat.c
@@ -495,7 +495,7 @@ HYPRE_Int hypre_GenerateSubComm(MPI_Comm comm, HYPRE_Int participate, MPI_Comm *
       *new_comm_ptr = new_comm;
       return 0;
    }
-   ranks = hypre_CTAlloc(HYPRE_Int, new_num_procs+2);
+   ranks = hypre_HostCTAlloc(HYPRE_Int, new_num_procs+2);
    if (new_num_procs == 1)
    {
       if (participate) my_info = my_id;
@@ -503,8 +503,9 @@ HYPRE_Int hypre_GenerateSubComm(MPI_Comm comm, HYPRE_Int participate, MPI_Comm *
    }
    else
    {
-      info = hypre_CTAlloc(HYPRE_Int, new_num_procs+2);
-      list_len = hypre_CTAlloc(HYPRE_Int, 1);
+      info = hypre_HostCTAlloc(HYPRE_Int, new_num_procs+2);
+      list_len = hypre_HostCTAlloc(HYPRE_Int, 1);
 
       if (participate) 
       {
@@ -522,8 +523,9 @@ HYPRE_Int hypre_GenerateSubComm(MPI_Comm comm, HYPRE_Int participate, MPI_Comm *
       hypre_MPI_Allreduce(info, ranks, list_len[0], HYPRE_MPI_INT, hypre_MPI_MERGE, comm);
 
       hypre_MPI_Op_free (&hypre_MPI_MERGE);
-      hypre_TFree(list_len);
-      hypre_TFree(info);
+
+      hypre_HostTFree(list_len);
+      hypre_HostTFree(info);
    }
    hypre_MPI_Comm_size(comm,&num_procs);
    hypre_MPI_Comm_group(comm, &orig_group);
@@ -532,7 +534,7 @@ HYPRE_Int hypre_GenerateSubComm(MPI_Comm comm, HYPRE_Int participate, MPI_Comm *
    hypre_MPI_Group_free(&new_group);
    hypre_MPI_Group_free(&orig_group);
 
-   hypre_TFree(ranks);
+   hypre_HostTFree(ranks);
    
    *new_comm_ptr = new_comm;
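
The hypre_HostCTAlloc / hypre_HostTFree switch above keeps the buffers that
are handed to MPI in plain host memory, presumably so they stay valid when
the library's default allocator targets GPU-visible unified memory. The
pattern is always paired, as in this condensed sketch of the code above:

    /* staging buffers for the merge-reduction live in host memory */
    HYPRE_Int *info = hypre_HostCTAlloc(HYPRE_Int, new_num_procs+2);
    /* ... fill info ... */
    hypre_MPI_Allreduce(info, ranks, list_len[0], HYPRE_MPI_INT,
                        hypre_MPI_MERGE, comm);
    hypre_HostTFree(info);
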
    
diff --git a/src/parcsr_ls/par_add_cycle.c b/src/parcsr_ls/par_add_cycle.c
index 5fb0e71..0366dfb 100644
--- a/src/parcsr_ls/par_add_cycle.c
+++ b/src/parcsr_ls/par_add_cycle.c
@@ -38,6 +38,7 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
    hypre_ParCSRMatrix    **P_array;
    hypre_ParCSRMatrix    **R_array;
    hypre_ParCSRMatrix    *Lambda;
+   hypre_ParCSRMatrix    *Atilde;
    hypre_ParVector    **F_array;
    hypre_ParVector    **U_array;
    hypre_ParVector    *Vtemp;
@@ -46,11 +47,12 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
    HYPRE_Int      **CF_marker_array;
 
    HYPRE_Int       num_levels;
-   HYPRE_Int       addlvl;
+   HYPRE_Int       addlvl, add_end;
    HYPRE_Int       additive;
    HYPRE_Int       mult_additive;
    HYPRE_Int       simple;
-   HYPRE_Int       i, num_rows;
+   HYPRE_Int       add_last_lvl;
+   HYPRE_Int       i, j, num_rows;
    HYPRE_Int       n_global;
    HYPRE_Int       rlx_order;
 
@@ -61,7 +63,9 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
    HYPRE_Int       fine_grid;
    HYPRE_Int       rlx_down;
    HYPRE_Int       rlx_up;
+   HYPRE_Int       rlx_coarse;
    HYPRE_Int      *grid_relax_type;
+   HYPRE_Int      *num_grid_sweeps;
    HYPRE_Real      **l1_norms;
    HYPRE_Real    alpha, beta;
    HYPRE_Real *u_data;
@@ -92,8 +96,10 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
    additive          = hypre_ParAMGDataAdditive(amg_data);
    mult_additive     = hypre_ParAMGDataMultAdditive(amg_data);
    simple            = hypre_ParAMGDataSimple(amg_data);
+   add_last_lvl      = hypre_ParAMGDataAddLastLvl(amg_data);
    grid_relax_type   = hypre_ParAMGDataGridRelaxType(amg_data);
    Lambda            = hypre_ParAMGDataLambda(amg_data);
+   Atilde            = hypre_ParAMGDataAtilde(amg_data);
    Xtilde            = hypre_ParAMGDataXtilde(amg_data);
    Rtilde            = hypre_ParAMGDataRtilde(amg_data);
    l1_norms          = hypre_ParAMGDataL1Norms(amg_data);
@@ -101,11 +107,14 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
    relax_weight      = hypre_ParAMGDataRelaxWeight(amg_data);
    omega             = hypre_ParAMGDataOmega(amg_data);
    rlx_order         = hypre_ParAMGDataRelaxOrder(amg_data);
+   num_grid_sweeps   = hypre_ParAMGDataNumGridSweeps(amg_data);
 
    /* Initialize */
 
    addlvl = hypre_max(additive, mult_additive);
    addlvl = hypre_max(addlvl, simple);
+   if (add_last_lvl == -1 ) add_end = num_levels-1;
+   else add_end = add_last_lvl;
    Solve_err_flag = 0;
 
    /*---------------------------------------------------------------------
@@ -115,6 +124,7 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
    /* down cycle */
    rlx_down = grid_relax_type[1];
    rlx_up = grid_relax_type[2];
+   rlx_coarse = grid_relax_type[3];
    for (level = 0; level < num_levels-1; level++)
    {
       fine_grid = level;
@@ -126,7 +136,7 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
 
       hypre_ParVectorSetConstantValues(U_array[coarse_grid], 0.0); 
 
-      if (level < addlvl) /* multiplicative version */
+      if (level < addlvl || level > add_end) /* multiplicative version */
       {
          /* smoothing step */
 
@@ -134,33 +144,42 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
          {
             HYPRE_Real *A_data = hypre_CSRMatrixData(hypre_ParCSRMatrixDiag(A_array[fine_grid]));
             HYPRE_Int *A_i = hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(A_array[fine_grid]));
-            hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
             num_rows = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array[fine_grid]));
+            for (j=0; j < num_grid_sweeps[1]; j++)
+            {
+             hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
 #endif
-            for (i = 0; i < num_rows; i++)
+             for (i = 0; i < num_rows; i++)
                u_data[i] = relax_weight[level]*v_data[i] / A_data[A_i[i]];
+            }
          }
 
          else if (rlx_down != 18)
          {
             /*hypre_BoomerAMGRelax(A_array[fine_grid],F_array[fine_grid],NULL,rlx_down,0,*/
-            hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid],
-	     CF_marker_array[fine_grid], rlx_down,rlx_order,1,
-             relax_weight[fine_grid], omega[fine_grid],
-             l1_norms[level], U_array[fine_grid], Vtemp, Ztemp);
-            hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
+            for (j=0; j < num_grid_sweeps[1]; j++)
+            {
+               hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid],
+                   CF_marker_array[fine_grid], rlx_down,rlx_order,1,
+                   relax_weight[fine_grid], omega[fine_grid],
+                   l1_norms[level], U_array[fine_grid], Vtemp, Ztemp);
+               hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
+            }
          }
          else
          {
-            hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
             num_rows = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array[fine_grid]));
+            for (j=0; j < num_grid_sweeps[1]; j++)
+            {
+             hypre_ParVectorCopy(F_array[fine_grid],Vtemp);
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
 #endif
-	    for (i = 0; i < num_rows; i++)
+             for (i = 0; i < num_rows; i++)
                u_data[i] += v_data[i] / l1_norms_lvl[i];
+            }
          }
      
          alpha = -1.0;
@@ -188,7 +207,7 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
       }
    }
 
-   /* solve coarse grid */ 
+   /* additive smoothing and solve coarse grid */ 
    if (addlvl < num_levels)
    {
       if (simple > -1)
@@ -203,16 +222,38 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
 	    x_global[i] += D_inv[i]*r_global[i];
       }
       else
-	 hypre_ParCSRMatrixMatvec(1.0, Lambda, Rtilde, 1.0, Xtilde);
+      {
+         if (num_grid_sweeps[1] > 1)
+         {
+            n_global = hypre_VectorSize(hypre_ParVectorLocalVector(Rtilde));
+            hypre_ParVector *Tmptilde = hypre_CTAlloc(hypre_ParVector, 1);
+            hypre_Vector *Tmptilde_local = hypre_SeqVectorCreate(n_global);   
+            hypre_SeqVectorInitialize(Tmptilde_local);
+            hypre_ParVectorLocalVector(Tmptilde) = Tmptilde_local;   
+            hypre_ParVectorOwnsData(Tmptilde) = 1;
+            hypre_ParCSRMatrixMatvec(1.0, Lambda, Rtilde, 0.0, Tmptilde);
+            hypre_ParVectorScale(2.0,Rtilde);
+            hypre_ParCSRMatrixMatvec(-1.0, Atilde, Tmptilde, 1.0, Rtilde);
+            hypre_ParVectorDestroy(Tmptilde);
+         }
+         hypre_ParCSRMatrixMatvec(1.0, Lambda, Rtilde, 1.0, Xtilde);
+      }
       if (addlvl == 0) hypre_ParVectorCopy(Xtilde, U_array[0]);
    }
-   else
+   if (add_end < num_levels -1)
    {
       fine_grid = num_levels -1;
-      hypre_ParCSRRelax(A_array[fine_grid], F_array[fine_grid],
+      for (j=0; j < num_grid_sweeps[3]; j++)
+         if (rlx_coarse == 18)
+            hypre_ParCSRRelax(A_array[fine_grid], F_array[fine_grid],
                               1, 1, l1_norms[fine_grid],
                               1.0, 1.0 ,0,0,0,0,
                               U_array[fine_grid], Vtemp, Ztemp);
+         else
+            hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid],
+                NULL, rlx_coarse,0,0,
+                relax_weight[fine_grid], omega[fine_grid],
+                l1_norms[fine_grid], U_array[fine_grid], Vtemp, Ztemp);
    }
 
    /* up cycle */
@@ -221,7 +262,7 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
       fine_grid = level - 1;
       coarse_grid = level;
 
-      if (level <= addlvl) /* multiplicative version */
+      if (level <= addlvl || level > add_end+1) /* multiplicative version */
       {
          alpha = 1.0;
          beta = 1.0;
@@ -230,7 +271,8 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
                                      beta, U_array[fine_grid]);            
          if (rlx_up != 18)
             /*hypre_BoomerAMGRelax(A_array[fine_grid],F_array[fine_grid],NULL,rlx_up,0,*/
-            hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid],
+            for (j=0; j < num_grid_sweeps[2]; j++)
+              hypre_BoomerAMGRelaxIF(A_array[fine_grid],F_array[fine_grid],
 		CF_marker_array[fine_grid],
 		rlx_up,rlx_order,2,
                 relax_weight[fine_grid], omega[fine_grid],
@@ -240,6 +282,7 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
             HYPRE_Int loc_relax_points[2];
             loc_relax_points[0] = -1;
             loc_relax_points[1] = 1;
+            for (j=0; j < num_grid_sweeps[2]; j++)
             for (i=0; i < 2; i++)
                 hypre_ParCSRRelax_L1_Jacobi(A_array[fine_grid],F_array[fine_grid],
                                             CF_marker_array[fine_grid],
@@ -248,6 +291,7 @@ hypre_BoomerAMGAdditiveCycle( void              *amg_vdata)
                                             U_array[fine_grid], Vtemp);
          }
          else 
+            for (j=0; j < num_grid_sweeps[2]; j++)
             hypre_ParCSRRelax(A_array[fine_grid], F_array[fine_grid],
                                  1, 1, l1_norms[fine_grid],
                                  1.0, 1.0 ,0,0,0,0,
@@ -281,6 +325,11 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
    hypre_ParCSRMatrix *Lambda;
    hypre_CSRMatrix *L_diag;
    hypre_CSRMatrix *L_offd;
+   hypre_ParCSRMatrix *Atilde;
+   hypre_CSRMatrix *Atilde_diag;
+   hypre_CSRMatrix *Atilde_offd;
+   HYPRE_Real    *Atilde_diag_data;
+   HYPRE_Real    *Atilde_offd_data;
    hypre_CSRMatrix *A_tmp_diag;
    hypre_CSRMatrix *A_tmp_offd;
    hypre_ParVector *Xtilde;
@@ -305,6 +354,10 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
    HYPRE_Int *L_diag_j;
    HYPRE_Int *L_offd_i;
    HYPRE_Int *L_offd_j;
+   HYPRE_Int *Atilde_diag_i;
+   HYPRE_Int *Atilde_diag_j;
+   HYPRE_Int *Atilde_offd_i;
+   HYPRE_Int *Atilde_offd_j;
    HYPRE_Int *A_tmp_diag_i;
    HYPRE_Int *A_tmp_offd_i;
    HYPRE_Int *A_tmp_diag_j;
@@ -340,10 +393,12 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
    HYPRE_Int       num_cols_offd = 0;
    HYPRE_Int       level, i, j, k;
    HYPRE_Int       this_proc, cnt, cnt_diag, cnt_offd;
+   HYPRE_Int       A_cnt_diag, A_cnt_offd;
    HYPRE_Int       cnt_recv, cnt_send, cnt_row, row_start;
    HYPRE_Int       start_diag, start_offd, indx, cnt_map;
    HYPRE_Int       start, j_indx, index, cnt_level;
    HYPRE_Int       max_sends, max_recvs;
+   HYPRE_Int       ns;
 
  /* Local variables  */ 
    HYPRE_Int       Solve_err_flag = 0;
@@ -354,6 +409,7 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
    /*HYPRE_Real   *relax_weight = NULL;
    HYPRE_Int      relax_type; */
    HYPRE_Int       add_rlx;
+   HYPRE_Int       add_last_lvl, add_end;
    HYPRE_Real  add_rlx_wt;
 
    /* Acquire data and allocate storage */
@@ -363,19 +419,23 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
    U_array           = hypre_ParAMGDataUArray(amg_data);
    additive          = hypre_ParAMGDataAdditive(amg_data);
    mult_additive     = hypre_ParAMGDataMultAdditive(amg_data);
+   add_last_lvl      = hypre_ParAMGDataAddLastLvl(amg_data);
    num_levels        = hypre_ParAMGDataNumLevels(amg_data);
    /*relax_weight      = hypre_ParAMGDataRelaxWeight(amg_data);
    relax_type        = hypre_ParAMGDataGridRelaxType(amg_data)[1];*/
    comm              = hypre_ParCSRMatrixComm(A_array[0]);
    add_rlx           = hypre_ParAMGDataAddRelaxType(amg_data);
    add_rlx_wt        = hypre_ParAMGDataAddRelaxWt(amg_data);
+   ns                = hypre_ParAMGDataNumGridSweeps(amg_data)[1];
 
    hypre_MPI_Comm_size(comm,&num_procs);
 
    l1_norms_ptr      = hypre_ParAMGDataL1Norms(amg_data); 
 
    addlvl = hypre_max(additive, mult_additive);
-   num_add_lvls = num_levels+1-addlvl;
+   if (add_last_lvl != -1) add_end = add_last_lvl+1;
+   else add_end = num_levels;
+   num_add_lvls = add_end+1-addlvl;
 
    level_start = hypre_CTAlloc(HYPRE_Int, num_add_lvls+1);
    send_data_L = 0;
@@ -387,7 +447,7 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
    cnt = 1;
    max_sends = 0;
    max_recvs = 0;
-   for (i=addlvl; i < num_levels; i++)
+   for (i=addlvl; i < add_end; i++)
    {
       A_tmp = A_array[i];
       A_tmp_diag = hypre_ParCSRMatrixDiag(A_tmp);
@@ -426,7 +486,7 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
    {
       if (max_sends < num_procs && max_recvs < num_procs)
       {
-         for (i=addlvl; i < num_levels; i++)
+         for (i=addlvl; i < add_end; i++)
          {
             A_tmp = A_array[i];
             comm_pkg = hypre_ParCSRMatrixCommPkg(A_tmp);
@@ -482,7 +542,7 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
          L_recv_ptr = hypre_CTAlloc(HYPRE_Int, num_recvs_L+1);
          L_send_ptr = hypre_CTAlloc(HYPRE_Int, num_sends_L+1);
 
-         for (i=addlvl; i < num_levels; i++)
+         for (i=addlvl; i < add_end; i++)
          {
             A_tmp = A_array[i];
             comm_pkg = hypre_ParCSRMatrixCommPkg(A_tmp);
@@ -524,7 +584,7 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
       {
          num_recvs_L = 0;
          num_sends_L = 0;
-         for (i=addlvl; i < num_levels; i++)
+         for (i=addlvl; i < add_end; i++)
          {
             A_tmp = A_array[i];
             comm_pkg = hypre_ParCSRMatrixCommPkg(A_tmp);
@@ -591,6 +651,7 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
    L_offd = hypre_CSRMatrixCreate(num_rows_L, num_cols_offd_L, num_nonzeros_offd);
    hypre_CSRMatrixInitialize(L_diag);
    hypre_CSRMatrixInitialize(L_offd);
+
    if (num_nonzeros_diag)
    {
       L_diag_data = hypre_CSRMatrixData(L_diag);
@@ -604,6 +665,26 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
    }
    L_offd_i = hypre_CSRMatrixI(L_offd);
 
+   if (ns > 1)  
+   {
+      Atilde_diag = hypre_CSRMatrixCreate(num_rows_L, num_rows_L, num_nonzeros_diag);
+      Atilde_offd = hypre_CSRMatrixCreate(num_rows_L, num_cols_offd_L, num_nonzeros_offd);
+      hypre_CSRMatrixInitialize(Atilde_diag);
+      hypre_CSRMatrixInitialize(Atilde_offd);
+      if (num_nonzeros_diag)
+      {
+         Atilde_diag_data = hypre_CSRMatrixData(Atilde_diag);
+         Atilde_diag_j = hypre_CSRMatrixJ(Atilde_diag);
+      }
+      Atilde_diag_i = hypre_CSRMatrixI(Atilde_diag);
+      if (num_nonzeros_offd)
+      {
+         Atilde_offd_data = hypre_CSRMatrixData(Atilde_offd);
+         Atilde_offd_j = hypre_CSRMatrixJ(Atilde_offd);
+      }
+      Atilde_offd_i = hypre_CSRMatrixI(Atilde_offd);
+   }
+
    if (num_rows_L) D_data = hypre_CTAlloc(HYPRE_Real,num_rows_L);
    if (send_data_L)
    {
@@ -639,7 +720,14 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
    cnt_row = 1; 
    L_diag_i[0] = 0;
    L_offd_i[0] = 0;
-   for (level=addlvl; level < num_levels; level++)
+   if (ns > 1) 
+   {
+      A_cnt_diag = 0; 
+      A_cnt_offd = 0; 
+      Atilde_diag_i[0] = 0;
+      Atilde_offd_i[0] = 0;
+   }
+   for (level=addlvl; level < add_end; level++)
    {
       row_start = level_start[cnt_level];
       if (level != 0)
@@ -716,11 +804,17 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
 #pragma omp for private(i) HYPRE_SMP_SCHEDULE
 #endif
          for (i=0; i < num_rows_tmp; i++)
-        {
+         {
            D_data[i] = add_rlx_wt/A_tmp_diag_data[A_tmp_diag_i[i]];
            L_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1];
            L_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1];
-        }
+         }
+         if (ns > 1)
+           for (i=0; i < num_rows_tmp; i++)
+           {
+             Atilde_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1];
+             Atilde_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1];
+           }
       }
       else
       {
@@ -734,6 +828,12 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
            L_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1];
            L_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1];
         }
+        if (ns > 1)
+          for (i=0; i < num_rows_tmp; i++)
+          {
+            Atilde_diag_i[cnt_row+i] = start_diag + A_tmp_diag_i[i+1];
+            Atilde_offd_i[cnt_row+i] = start_offd + A_tmp_offd_i[i+1];
+          }
       }
  
       if (num_procs > 1)
@@ -754,6 +854,11 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
       for (i = 0; i < num_rows_tmp; i++)
       {
          j_indx = A_tmp_diag_i[i];
+         if (ns > 1)
+         {
+            Atilde_diag_data[A_cnt_diag] = A_tmp_diag_data[j_indx];
+            Atilde_diag_j[A_cnt_diag++] = i+row_start;
+         }
          L_diag_data[cnt_diag] = (2.0 - A_tmp_diag_data[j_indx]*D_data[i])*D_data[i];
          L_diag_j[cnt_diag++] = i+row_start;
          for (j=A_tmp_diag_i[i]+1; j < A_tmp_diag_i[i+1]; j++)
@@ -768,6 +873,21 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
              L_offd_data[cnt_offd] = (- A_tmp_offd_data[j]*D_data_offd[j_indx])*D_data[i];
              L_offd_j[cnt_offd++] = remap[j_indx];
          }
+         if (ns > 1)
+         {
+            for (j=A_tmp_diag_i[i]+1; j < A_tmp_diag_i[i+1]; j++)
+            {
+               j_indx = A_tmp_diag_j[j];
+               Atilde_diag_data[A_cnt_diag] = A_tmp_diag_data[j];
+               Atilde_diag_j[A_cnt_diag++] = j_indx+row_start;
+            }
+            for (j=A_tmp_offd_i[i]; j < A_tmp_offd_i[i+1]; j++)
+            {
+               j_indx = A_tmp_offd_j[j];
+               Atilde_offd_data[A_cnt_offd] = A_tmp_offd_data[j];
+               Atilde_offd_j[A_cnt_offd++] = remap[j_indx];
+            }
+         }
       }
       cnt_row += num_rows_tmp;
    }
@@ -809,6 +929,52 @@ HYPRE_Int hypre_CreateLambda(void *amg_vdata)
    hypre_ParCSRMatrixComm(Lambda) = comm;
    hypre_ParCSRMatrixOwnsData(Lambda) = 1;
 
+   if (ns > 1)
+   {
+      /*hypre_ParCSRCommPkg *A_comm_pkg = NULL;
+      HYPRE_Int *A_recv_ptr = NULL;
+      HYPRE_Int *A_send_ptr = NULL;
+      HYPRE_Int *A_recv_procs = NULL;
+      HYPRE_Int *A_send_procs = NULL;
+      HYPRE_Int *A_send_map_elmts = NULL;
+
+      A_comm_pkg = hypre_CTAlloc(hypre_ParCSRCommPkg,1);
+
+      A_recv_ptr = hypre_CTAlloc(HYPRE_Int, num_recvs+1);
+      A_send_ptr = hypre_CTAlloc(HYPRE_Int, num_sends+1);
+      A_recv_procs = hypre_CTAlloc(HYPRE_Int, num_recvs_L);
+      A_send_procs = hypre_CTAlloc(HYPRE_Int, num_sends_L);
+      A_send_map_elmts = hypre_CTAlloc(HYPRE_Int, L_send_ptr[num_sends_L]);
+
+      for (i=0; i<num_recvs_L+1; i++)
+	 A_recv_ptr[i] = L_recv_ptr[i];
+      for (i=0; i<num_sends_L+1; i++)
+	 A_send_ptr[i] = L_send_ptr[i];
+      for (i=0; i<num_recvs_L; i++)
+	 A_recv_procs[i] = L_recv_procs[i];
+      for (i=0; i<num_sends_L; i++)
+	 A_send_procs[i] = L_send_procs[i];
+      for (i=0; i < L_send_ptr[num_sends_L]; i++)
+	 A_send_map_elmts[i] = L_send_map_elmts[i];
+    
+      hypre_ParCSRCommPkgNumRecvs(A_comm_pkg) = num_recvs_L;
+      hypre_ParCSRCommPkgNumSends(A_comm_pkg) = num_sends_L;
+      hypre_ParCSRCommPkgRecvProcs(A_comm_pkg) = A_recv_procs;
+      hypre_ParCSRCommPkgSendProcs(A_comm_pkg) = A_send_procs;
+      hypre_ParCSRCommPkgRecvVecStarts(A_comm_pkg) = A_recv_ptr;
+      hypre_ParCSRCommPkgSendMapStarts(A_comm_pkg) = A_send_ptr;
+      hypre_ParCSRCommPkgSendMapElmts(A_comm_pkg) = A_send_map_elmts;
+      hypre_ParCSRCommPkgComm(A_comm_pkg) = comm; */
+
+      Atilde = hypre_CTAlloc(hypre_ParCSRMatrix, 1);
+      hypre_ParCSRMatrixDiag(Atilde) = Atilde_diag;
+      hypre_ParCSRMatrixOffd(Atilde) = Atilde_offd;
+      hypre_ParCSRMatrixCommPkg(Atilde) = L_comm_pkg;
+      hypre_ParCSRMatrixComm(Atilde) = comm;
+      hypre_ParCSRMatrixOwnsData(Atilde) = 1;
+      hypre_ParAMGDataAtilde(amg_data) = Atilde;
+   }
+
    hypre_ParAMGDataLambda(amg_data) = Lambda;
    hypre_ParAMGDataRtilde(amg_data) = Rtilde;
    hypre_ParAMGDataXtilde(amg_data) = Xtilde;
@@ -852,6 +1018,7 @@ HYPRE_Int hypre_CreateDinv(void *amg_vdata)
    HYPRE_Int       level, i;
    HYPRE_Int       add_rlx;
    HYPRE_Real      add_rlx_wt;
+   HYPRE_Int       add_last_lvl, add_end;
 
  /* Local variables  */ 
    HYPRE_Int       Solve_err_flag = 0;
@@ -869,14 +1036,17 @@ HYPRE_Int hypre_CreateDinv(void *amg_vdata)
    num_levels        = hypre_ParAMGDataNumLevels(amg_data);
    add_rlx_wt        = hypre_ParAMGDataAddRelaxWt(amg_data);
    add_rlx           = hypre_ParAMGDataAddRelaxType(amg_data);
+   add_last_lvl      = hypre_ParAMGDataAddLastLvl(amg_data);
    /*relax_weight      = hypre_ParAMGDataRelaxWeight(amg_data);
    relax_type        = hypre_ParAMGDataGridRelaxType(amg_data)[1];*/
 
    l1_norms_ptr      = hypre_ParAMGDataL1Norms(amg_data); 
    /* smooth_option       = hypre_ParAMGDataSmoothOption(amg_data); */
+   if (add_last_lvl == -1 ) add_end = num_levels;
+   else add_end = add_last_lvl;
 
    num_rows_L  = 0;
-   for (i=addlvl; i < num_levels; i++)
+   for (i=addlvl; i < add_end; i++)
    {
       A_tmp = A_array[i];
       A_tmp_diag = hypre_ParCSRMatrixDiag(A_tmp);
@@ -901,7 +1071,7 @@ HYPRE_Int hypre_CreateDinv(void *amg_vdata)
    D_inv = hypre_CTAlloc(HYPRE_Real, num_rows_L);
 
    l1_start = 0;
-   for (level=addlvl; level < num_levels; level++)
+   for (level=addlvl; level < add_end; level++)
    {
       if (level != 0)
       {
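
When more than one fine-grid sweep is requested (num_grid_sweeps[1] > 1), the
additive solve above upgrades the single application Xtilde += Lambda*Rtilde
to a two-sweep polynomial built from the newly stored Atilde. Annotating the
matvec sequence from the patch makes the algebra explicit:

    /* with x0 = 0, two sweeps of  x <- x + Lambda*(r - Atilde*x)
       yield  x = (2*Lambda - Lambda*Atilde*Lambda) * r            */
    hypre_ParCSRMatrixMatvec( 1.0, Lambda, Rtilde, 0.0, Tmptilde); /* t = L r      */
    hypre_ParVectorScale(2.0, Rtilde);                             /* r = 2 r      */
    hypre_ParCSRMatrixMatvec(-1.0, Atilde, Tmptilde, 1.0, Rtilde); /* r = 2r - A t */
    hypre_ParCSRMatrixMatvec( 1.0, Lambda, Rtilde, 1.0, Xtilde);   /* x += L r     */
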
diff --git a/src/parcsr_ls/par_amg.c b/src/parcsr_ls/par_amg.c
index cb71ec8..68bea2b 100644
--- a/src/parcsr_ls/par_amg.c
+++ b/src/parcsr_ls/par_amg.c
@@ -97,6 +97,9 @@ hypre_BoomerAMGCreate()
    char    *euclidfile;
 
    HYPRE_Int cheby_order;
+   HYPRE_Int cheby_eig_est;
+   HYPRE_Int cheby_variant;
+   HYPRE_Int cheby_scale;
    HYPRE_Real cheby_eig_ratio;
 
    HYPRE_Int block_mode;
@@ -104,6 +107,7 @@ hypre_BoomerAMGCreate()
    HYPRE_Int        additive;
    HYPRE_Int        mult_additive;
    HYPRE_Int        simple;
+   HYPRE_Int        add_last_lvl;
    HYPRE_Real   add_trunc_factor;
    HYPRE_Int      add_P_max_elmts;
    HYPRE_Int      add_rlx_type;
@@ -198,6 +202,9 @@ hypre_BoomerAMGCreate()
    outer_wt = 1.0;
 
    cheby_order = 2;
+   cheby_variant = 0;
+   cheby_scale = 1;
+   cheby_eig_est = 10;
    cheby_eig_ratio = .3;
 
    block_mode = 0;
@@ -205,6 +212,7 @@ hypre_BoomerAMGCreate()
    additive = -1;
    mult_additive = -1;
    simple = -1;
+   add_last_lvl = -1;
    add_trunc_factor = 0.0;
    add_P_max_elmts = 0;
    add_rlx_type = 18;
@@ -301,6 +309,9 @@ hypre_BoomerAMGCreate()
 
    hypre_BoomerAMGSetChebyOrder(amg_data, cheby_order);
    hypre_BoomerAMGSetChebyFraction(amg_data, cheby_eig_ratio);
+   hypre_BoomerAMGSetChebyEigEst(amg_data, cheby_eig_est);
+   hypre_BoomerAMGSetChebyVariant(amg_data, cheby_variant);
+   hypre_BoomerAMGSetChebyScale(amg_data, cheby_scale);
 
    hypre_BoomerAMGSetNumIterations(amg_data, num_iterations);
 
@@ -311,6 +322,7 @@ hypre_BoomerAMGCreate()
    hypre_BoomerAMGSetMultAddTruncFactor(amg_data, add_trunc_factor);
    hypre_BoomerAMGSetAddRelaxType(amg_data, add_rlx_type);
    hypre_BoomerAMGSetAddRelaxWt(amg_data, add_rlx_wt);
+   hypre_ParAMGDataAddLastLvl(amg_data) = add_last_lvl;
    hypre_ParAMGDataLambda(amg_data) = NULL;
    hypre_ParAMGDataXtilde(amg_data) = NULL;
    hypre_ParAMGDataRtilde(amg_data) = NULL;
@@ -353,8 +365,11 @@ hypre_BoomerAMGCreate()
    /* this can not be set by the user currently */
    hypre_ParAMGDataBlockMode(amg_data) = block_mode;
 
+   /* Data used by the Chebyshev smoother */
    hypre_ParAMGDataMaxEigEst(amg_data) = NULL;
    hypre_ParAMGDataMinEigEst(amg_data) = NULL;
+   hypre_ParAMGDataChebyDS(amg_data) = NULL;
+   hypre_ParAMGDataChebyCoefs(amg_data) = NULL;
 
    /* BM Oct 22, 2006 */
    hypre_ParAMGDataPlotGrids(amg_data) = 0;
@@ -503,6 +518,14 @@ hypre_BoomerAMGDestroy( void *data )
    if (hypre_ParAMGDataLambda(amg_data))
       hypre_ParCSRMatrixDestroy(hypre_ParAMGDataLambda(amg_data));
 
+   if (hypre_ParAMGDataAtilde(amg_data))
+   {
+      hypre_ParCSRMatrix *Atilde = hypre_ParAMGDataAtilde(amg_data);
+      hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(Atilde));
+      hypre_CSRMatrixDestroy(hypre_ParCSRMatrixOffd(Atilde));
+      hypre_TFree (Atilde);
+   }
+
    if (hypre_ParAMGDataXtilde(amg_data))
       hypre_ParVectorDestroy(hypre_ParAMGDataXtilde(amg_data));
 
@@ -517,6 +540,22 @@ hypre_BoomerAMGDestroy( void *data )
       hypre_TFree(hypre_ParAMGDataL1Norms(amg_data));
    }
 
+   if (hypre_ParAMGDataChebyCoefs(amg_data))
+   {
+      for (i=0; i < num_levels; i++)
+         if (hypre_ParAMGDataChebyCoefs(amg_data)[i])
+           hypre_TFree(hypre_ParAMGDataChebyCoefs(amg_data)[i]);
+      hypre_TFree(hypre_ParAMGDataChebyCoefs(amg_data));
+   }
+
+   if (hypre_ParAMGDataChebyDS(amg_data))
+   {
+      for (i=0; i < num_levels; i++)
+         if (hypre_ParAMGDataChebyDS(amg_data)[i])
+           hypre_TFree(hypre_ParAMGDataChebyDS(amg_data)[i]);
+      hypre_TFree(hypre_ParAMGDataChebyDS(amg_data));
+   }
+
    if (hypre_ParAMGDataDinv(amg_data))
       hypre_TFree(hypre_ParAMGDataDinv(amg_data));
 
@@ -3464,6 +3503,57 @@ hypre_BoomerAMGSetChebyFraction( void     *data,
 
    return hypre_error_flag;
 }
+
+/*--------------------------------------------------------------------------
+ * hypre_BoomerAMGSetChebyEigEst
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+hypre_BoomerAMGSetChebyEigEst( void     *data,
+                              HYPRE_Int     cheby_eig_est)
+{
+   hypre_ParAMGData  *amg_data = (hypre_ParAMGData*) data;
+ 
+   if (!amg_data)
+   {
+      hypre_error_in_arg(1);
+      return hypre_error_flag;
+   } 
+   if (cheby_eig_est < 0)
+   {
+      hypre_error_in_arg(2);
+      return hypre_error_flag;
+   } 
+   hypre_ParAMGDataChebyEigEst(amg_data) = cheby_eig_est;
+
+   return hypre_error_flag;
+}
+
+/*--------------------------------------------------------------------------
+ * hypre_BoomerAMGSetChebyVariant
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+hypre_BoomerAMGSetChebyVariant( void     *data,
+                              HYPRE_Int     cheby_variant)
+{
+   hypre_ParAMGData  *amg_data = (hypre_ParAMGData*) data;
+ 
+   if (!amg_data)
+   {
+      hypre_error_in_arg(1);
+      return hypre_error_flag;
+   } 
+   hypre_ParAMGDataChebyVariant(amg_data) = cheby_variant;
+
+   return hypre_error_flag;
+}
+
+/*--------------------------------------------------------------------------
+ * hypre_BoomerAMGSetChebyScale
+ *--------------------------------------------------------------------------*/
+
+HYPRE_Int
+hypre_BoomerAMGSetChebyScale( void     *data,
+                              HYPRE_Int     cheby_scale)
+{
+   hypre_ParAMGData  *amg_data = (hypre_ParAMGData*) data;
+ 
+   if (!amg_data)
+   {
+      hypre_error_in_arg(1);
+      return hypre_error_flag;
+   } 
+   hypre_ParAMGDataChebyScale(amg_data) = cheby_scale;
+
+   return hypre_error_flag;
+}
+
 
 /*--------------------------------------------------------------------------
  * hypre_BoomerAMGSetInterpVectors
@@ -3719,6 +3809,23 @@ hypre_BoomerAMGGetSimple( void *data,
 }
 
 HYPRE_Int
+hypre_BoomerAMGSetAddLastLvl( void *data,
+                          HYPRE_Int   add_last_lvl )
+{
+   hypre_ParAMGData  *amg_data = (hypre_ParAMGData*) data;
+
+   if (!amg_data)
+   {
+      hypre_error_in_arg(1);
+      return hypre_error_flag;
+   }
+
+   hypre_ParAMGDataAddLastLvl(amg_data) = add_last_lvl;
+
+   return hypre_error_flag;
+}
+
+HYPRE_Int
 hypre_BoomerAMGSetNonGalerkinTol( void   *data,
                             HYPRE_Real nongalerkin_tol)
 {
diff --git a/src/parcsr_ls/par_amg.h b/src/parcsr_ls/par_amg.h
index 0c7cb23..8ef15e3 100644
--- a/src/parcsr_ls/par_amg.h
+++ b/src/parcsr_ls/par_amg.h
@@ -105,7 +105,6 @@ typedef struct
    HYPRE_Int                  num_levels;
    HYPRE_Real         **l1_norms;
 
-
    /* Block data */
    hypre_ParCSRBlockMatrix **A_block_array;
    hypre_ParCSRBlockMatrix **P_block_array;
@@ -136,8 +135,13 @@ typedef struct
 
    HYPRE_Real          *max_eig_est;
    HYPRE_Real          *min_eig_est;
-   HYPRE_Int                  cheby_order;
+   HYPRE_Int            cheby_eig_est;
+   HYPRE_Int            cheby_order;
+   HYPRE_Int            cheby_variant;
+   HYPRE_Int            cheby_scale;
    HYPRE_Real           cheby_fraction;
+   HYPRE_Real         **cheby_ds;
+   HYPRE_Real         **cheby_coefs;
 
    /* data needed for non-Galerkin option */
    HYPRE_Int           nongalerk_num_tol;
@@ -209,11 +213,13 @@ typedef struct
    HYPRE_Int      additive;
    HYPRE_Int      mult_additive;
    HYPRE_Int      simple;
+   HYPRE_Int      add_last_lvl;
    HYPRE_Int      add_P_max_elmts;
    HYPRE_Real     add_trunc_factor;
    HYPRE_Int      add_rlx_type;
    HYPRE_Real     add_rlx_wt;
    hypre_ParCSRMatrix *Lambda;
+   hypre_ParCSRMatrix *Atilde;
    hypre_ParVector *Rtilde;
    hypre_ParVector *Xtilde;
    HYPRE_Real *D_inv;
@@ -332,9 +338,13 @@ typedef struct
 
 #define hypre_ParAMGDataMaxEigEst(amg_data) ((amg_data)->max_eig_est)	
 #define hypre_ParAMGDataMinEigEst(amg_data) ((amg_data)->min_eig_est)	
+#define hypre_ParAMGDataChebyEigEst(amg_data) ((amg_data)->cheby_eig_est)
+#define hypre_ParAMGDataChebyVariant(amg_data) ((amg_data)->cheby_variant)
+#define hypre_ParAMGDataChebyScale(amg_data) ((amg_data)->cheby_scale)
 #define hypre_ParAMGDataChebyOrder(amg_data) ((amg_data)->cheby_order)
 #define hypre_ParAMGDataChebyFraction(amg_data) ((amg_data)->cheby_fraction)
-
+#define hypre_ParAMGDataChebyDS(amg_data) ((amg_data)->cheby_ds)
+#define hypre_ParAMGDataChebyCoefs(amg_data) ((amg_data)->cheby_coefs)
 
 /* block */
 #define hypre_ParAMGDataABlockArray(amg_data) ((amg_data)->A_block_array)
@@ -408,11 +418,13 @@ typedef struct
 #define hypre_ParAMGDataAdditive(amg_data) ((amg_data)->additive)
 #define hypre_ParAMGDataMultAdditive(amg_data) ((amg_data)->mult_additive)
 #define hypre_ParAMGDataSimple(amg_data) ((amg_data)->simple)
+#define hypre_ParAMGDataAddLastLvl(amg_data) ((amg_data)->add_last_lvl)
 #define hypre_ParAMGDataMultAddPMaxElmts(amg_data) ((amg_data)->add_P_max_elmts)
 #define hypre_ParAMGDataMultAddTruncFactor(amg_data) ((amg_data)->add_trunc_factor)
 #define hypre_ParAMGDataAddRelaxType(amg_data) ((amg_data)->add_rlx_type)
 #define hypre_ParAMGDataAddRelaxWt(amg_data) ((amg_data)->add_rlx_wt)
 #define hypre_ParAMGDataLambda(amg_data) ((amg_data)->Lambda)
+#define hypre_ParAMGDataAtilde(amg_data) ((amg_data)->Atilde)
 #define hypre_ParAMGDataRtilde(amg_data) ((amg_data)->Rtilde)
 #define hypre_ParAMGDataXtilde(amg_data) ((amg_data)->Xtilde)
 #define hypre_ParAMGDataDinv(amg_data) ((amg_data)->D_inv)
diff --git a/src/parcsr_ls/par_amg_setup.c b/src/parcsr_ls/par_amg_setup.c
index 15b71b8..32241b0 100644
--- a/src/parcsr_ls/par_amg_setup.c
+++ b/src/parcsr_ls/par_amg_setup.c
@@ -79,10 +79,11 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
    HYPRE_Int      mult_additive = hypre_ParAMGDataMultAdditive(amg_data);
    HYPRE_Int      additive = hypre_ParAMGDataAdditive(amg_data);
    HYPRE_Int      simple = hypre_ParAMGDataSimple(amg_data);
+   HYPRE_Int      add_last_lvl = hypre_ParAMGDataAddLastLvl(amg_data);
    HYPRE_Int      add_P_max_elmts = hypre_ParAMGDataMultAddPMaxElmts(amg_data);
    HYPRE_Real     add_trunc_factor = hypre_ParAMGDataMultAddTruncFactor(amg_data);
    HYPRE_Int      add_rlx = hypre_ParAMGDataAddRelaxType(amg_data);
-   HYPRE_Real      add_rlx_wt = hypre_ParAMGDataAddRelaxWt(amg_data);
+   HYPRE_Real     add_rlx_wt = hypre_ParAMGDataAddRelaxWt(amg_data);
 
    hypre_ParCSRBlockMatrix **A_block_array, **P_block_array;
  
@@ -99,8 +100,11 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
    hypre_ParCSRMatrix  *AN = NULL;
    hypre_ParCSRMatrix  *P1;
    hypre_ParCSRMatrix  *P2;
+   hypre_ParCSRMatrix  *Pnew = NULL;
    HYPRE_Real          *SmoothVecs = NULL;
    HYPRE_Real         **l1_norms = NULL;
+   HYPRE_Real         **cheby_ds = NULL;
+   HYPRE_Real         **cheby_coefs = NULL;
 
    HYPRE_Int       old_num_levels, num_levels;
    HYPRE_Int       level;
@@ -183,7 +187,23 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
    HYPRE_Int rap2 = hypre_ParAMGDataRAP2(amg_data);
    HYPRE_Int keepTranspose = hypre_ParAMGDataKeepTranspose(amg_data);
 
+   HYPRE_Int *num_grid_sweeps = hypre_ParAMGDataNumGridSweeps(amg_data);
+   HYPRE_Int ns = num_grid_sweeps[1];
    HYPRE_Real    wall_time;   /* for debugging instrumentation */
+   HYPRE_Int      add_end;
+
+#ifdef HYPRE_USE_GPU
+   if (!hypre_ParCSRMatrixIsManaged(A)){
+     hypre_fprintf(stderr,"ERROR:: INVALID A in hypre_BoomerAMGSetup::Address %p\n",A);
+     //exit(2);
+   } else if(!hypre_ParVectorIsManaged(f)){
+     hypre_fprintf(stderr,"ERROR:: INVALID f in hypre_BoomerAMGSetup::Address %p\n",f);
+     //exit(2);
+   } else if (!hypre_ParVectorIsManaged(u)){
+     hypre_fprintf(stderr,"ERROR:: INVALID u in hypre_BoomerAMGSetup::Address %p\n",u);
+     //exit(2);
+   } 
+#endif
 
    /*hypre_CSRMatrix *A_new;*/
 
@@ -196,6 +216,8 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
    hypre_CSRMatrixPrint(A_new, "Atestnew"); */
    old_num_levels = hypre_ParAMGDataNumLevels(amg_data);
    max_levels = hypre_ParAMGDataMaxLevels(amg_data);
+   add_end = hypre_min(add_last_lvl, max_levels-1);
+   if (add_end == -1) add_end = max_levels-1;
    amg_logging = hypre_ParAMGDataLogging(amg_data);
    amg_print_level = hypre_ParAMGDataPrintLevel(amg_data);
    coarsen_type = hypre_ParAMGDataCoarsenType(amg_data);
@@ -745,7 +767,6 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
    while (not_finished_coarsening)
    {
 
-
       /* only do nodal coarsening on a fixed number of levels */
       if (level >= nodal_levels)
       {
@@ -1144,7 +1165,8 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
 	    if (S) hypre_ParCSRMatrixDestroy(S);
 	    if (SN) hypre_ParCSRMatrixDestroy(SN);
 	    if (AN) hypre_ParCSRMatrixDestroy(AN);
-            hypre_TFree(CF_marker);
+            if (num_functions > 1) hypre_TFree(coarse_dof_func);
+            hypre_TFree(CF_marker);
             hypre_TFree(coarse_pnts_global);
             if (level > 0)
             {
@@ -1389,7 +1411,6 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
 	    fflush(NULL);
          }
 
-
             if (debug_flag==1) wall_time = time_getWallclockSeconds();
 
             if (interp_type == 4) 
@@ -1895,11 +1916,9 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
 
       if (!block_mode)
       {
-         if (mult_addlvl > -1 && level >= mult_addlvl)
+         if (mult_addlvl > -1 && level >= mult_addlvl && level <= add_end)
          {
             HYPRE_Real *d_diag;
-            hypre_ParCSRMatrix *Q = NULL;
-            Q = hypre_ParMatmul(A_array[level],P);
             if (add_rlx == 0)
             {
                hypre_CSRMatrix *lvl_Adiag = hypre_ParCSRMatrixDiag(A_array[level]);
@@ -1917,54 +1936,113 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
                if (num_threads == 1) 
 		  hypre_ParCSRComputeL1Norms(A_array[level], 1, NULL, &d_diag);
                else 
-                  hypre_ParCSRComputeL1NormsThreads(A_array[level], 1, num_threads, NULL, &d_diag);
+                  hypre_ParCSRComputeL1NormsThreads(A_array[level], 1, 
+			num_threads, NULL, &d_diag);
             }
-            hypre_ParCSRMatrixAminvDB(P,Q,d_diag,&P_array[level]);
-            A_H = hypre_ParTMatmul(P,Q);
-            hypre_ParCSRMatrixRowStarts(A_H) = hypre_ParCSRMatrixColStarts(A_H);
-            hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1;
-            hypre_ParCSRMatrixOwnsColStarts(A_H) = 0;
-            hypre_ParCSRMatrixOwnsColStarts(P) = 0; 
-            if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H); 
-            /*hypre_ParCSRMatrixDestroy(P); */
-            hypre_TFree(d_diag); 
-	    /*hypre_BoomerAMGBuildCoarseOperator(P, A_array[level] , P, &A_H); 
-            hypre_ParCSRMatrix *C = NULL;
-            HYPRE_Int *num_grid_sweeps
-                        = hypre_ParAMGDataNumGridSweeps(amg_data);
-            if (grid_relax_type[1] == 18)
-		C = hypre_CreateC(A_array[level], 0.0);
-            else
-		C = hypre_CreateC(A_array[level], relax_weight[level]);
-            if (num_grid_sweeps[1] > 1)
+            if (ns == 1)
             {
-                  hypre_ParCSRMatrix *Pnew = NULL;
-                  Pnew = hypre_ParMatmul(C,P);
-                  P_array[level] = hypre_ParMatmul(C,Pnew);
-                  hypre_ParCSRMatrixDestroy(Pnew);
+               hypre_ParCSRMatrix *Q = NULL;
+               Q = hypre_ParMatmul(A_array[level],P);
+               hypre_ParCSRMatrixAminvDB(P,Q,d_diag,&P_array[level]);
+               A_H = hypre_ParTMatmul(P,Q);
+               hypre_ParCSRMatrixRowStarts(A_H) = hypre_ParCSRMatrixColStarts(A_H);
+               hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1;
+               hypre_ParCSRMatrixOwnsColStarts(A_H) = 0;
+               hypre_ParCSRMatrixOwnsColStarts(P) = 0; 
+               if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H); 
+               /*hypre_ParCSRMatrixDestroy(P); */
+               hypre_TFree(d_diag); 
+               /* Set NonGalerkin drop tol on each level */
+               if (level < nongalerk_num_tol) nongalerk_tol_l = nongalerk_tol[level];
+               if (nongal_tol_array) nongalerk_tol_l = nongal_tol_array[level];
+               if (nongalerk_tol_l > 0.0)
+               {
+               /* Build Non-Galerkin Coarse Grid */
+                  hypre_ParCSRMatrix *Q = NULL;
+                  hypre_BoomerAMGBuildNonGalerkinCoarseOperator(&A_H, Q,
+                    0.333*strong_threshold, max_row_sum, num_functions, 
+                    dof_func_array[level+1], S_commpkg_switch, CF_marker_array[level], 
+                    /* nongalerk_tol, sym_collapse, lump_percent, beta );*/
+                      nongalerk_tol_l,      1,            0.5,    1.0 );
+            
+                  hypre_ParCSRMatrixColStarts(P_array[level]) = hypre_ParCSRMatrixRowStarts(A_H);
+                  if (!hypre_ParCSRMatrixCommPkg(A_H))
+                     hypre_MatvecCommPkgCreate(A_H);
+               }
+               hypre_ParCSRMatrixDestroy(Q);
+			
             }
-            else
-                  P_array[level] = hypre_ParMatmul(C,P);
-            hypre_ParCSRMatrixDestroy(C); */
-
-            /* Set NonGalerkin drop tol on each level */
-            if (level < nongalerk_num_tol) nongalerk_tol_l = nongalerk_tol[level];
-            if (nongal_tol_array) nongalerk_tol_l = nongal_tol_array[level];
-            if (nongalerk_tol_l > 0.0)
+            else 
             {
-            /* Build Non-Galerkin Coarse Grid */
-               hypre_BoomerAMGBuildNonGalerkinCoarseOperator(&A_H, Q,
+               HYPRE_Int ns_tmp = ns;
+               hypre_ParCSRMatrix *C = NULL;
+               hypre_ParCSRMatrix *Ptmp = NULL;
+               /* Set NonGalerkin drop tol on each level */
+               if (level < nongalerk_num_tol)
+                   nongalerk_tol_l = nongalerk_tol[level];
+               if (nongal_tol_array) nongalerk_tol_l = nongal_tol_array[level];
+
+               if (nongalerk_tol_l > 0.0)
+               {
+                  /* Construct AP, and then RAP */
+                  hypre_ParCSRMatrix *Q = NULL;
+                  Q = hypre_ParMatmul(A_array[level],P_array[level]);
+                  A_H = hypre_ParTMatmul(P_array[level],Q);
+                  hypre_ParCSRMatrixRowStarts(A_H) = hypre_ParCSRMatrixColStarts(A_H);
+                  hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1;
+                  hypre_ParCSRMatrixOwnsColStarts(A_H) = 0;
+                  hypre_ParCSRMatrixOwnsColStarts(P_array[level]) = 0;
+                  if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H);
+            
+                  /* Build Non-Galerkin Coarse Grid */
+                  hypre_BoomerAMGBuildNonGalerkinCoarseOperator(&A_H, Q,
                     0.333*strong_threshold, max_row_sum, num_functions, 
                     dof_func_array[level+1], S_commpkg_switch, CF_marker_array[level], 
                     /* nongalerk_tol, sym_collapse, lump_percent, beta );*/
                       nongalerk_tol_l,      1,            0.5,    1.0 );
             
-               hypre_ParCSRMatrixColStarts(P_array[level]) = hypre_ParCSRMatrixRowStarts(A_H);
-               if (!hypre_ParCSRMatrixCommPkg(A_H))
-                   hypre_MatvecCommPkgCreate(A_H);
-			
+                  if (!hypre_ParCSRMatrixCommPkg(A_H))
+                     hypre_MatvecCommPkgCreate(A_H);
+            
+                  /* Delete AP */
+                  hypre_ParCSRMatrixDestroy(Q);
+               }
+               else if (rap2)
+               {
+                  /* Use two matrix products to generate A_H */
+                  hypre_ParCSRMatrix *Q = NULL;
+                  Q = hypre_ParMatmul(A_array[level],P_array[level]);
+                  A_H = hypre_ParTMatmul(P_array[level],Q);
+                  hypre_ParCSRMatrixOwnsRowStarts(A_H) = 1;
+                  hypre_ParCSRMatrixOwnsColStarts(A_H) = 0;
+                  hypre_ParCSRMatrixOwnsColStarts(P_array[level]) = 0;
+                  if (num_procs > 1) hypre_MatvecCommPkgCreate(A_H);
+                  /* Delete AP */
+                  hypre_ParCSRMatrixDestroy(Q);
+               }
+               else
+	          hypre_BoomerAMGBuildCoarseOperatorKT(P, A_array[level] , P, 
+			keepTranspose, &A_H); 
+	
+               if (add_rlx == 18)
+	          C = hypre_CreateC(A_array[level], 0.0);
+               else
+		  C = hypre_CreateC(A_array[level], add_rlx_wt);
+               Ptmp = P;
+	       while (ns_tmp > 0)
+               {
+                  Pnew = Ptmp;
+                  Ptmp = NULL;
+		  Ptmp = hypre_ParMatmul(C,Pnew);
+                  if (ns_tmp < ns)
+			hypre_ParCSRMatrixDestroy(Pnew);
+		  ns_tmp--;
+               }
+               Pnew = Ptmp;
+               P_array[level] = Pnew;
+               hypre_ParCSRMatrixDestroy(C); 
             }
-            hypre_ParCSRMatrixDestroy(Q);
+
 
 
             if (add_P_max_elmts || add_trunc_factor)
@@ -2012,7 +2090,7 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
          A_block_array[level+1] = A_H_block;
 
       }
-      else if (mult_addlvl == -1 || level < mult_addlvl)
+      else if (mult_addlvl == -1 || level < mult_addlvl || level > add_end)
       {
          /* Set NonGalerkin drop tol on each level */
          if (level < nongalerk_num_tol)
@@ -2062,6 +2140,12 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
             /* Compute standard Galerkin coarse-grid product */
             hypre_BoomerAMGBuildCoarseOperatorKT(P_array[level], A_array[level] , 
                                         P_array[level], keepTranspose, &A_H);
+            if (Pnew && ns==1) 
+            {
+               hypre_ParCSRMatrixDestroy(P);
+               P_array[level] = Pnew;
+            }
+	
          }
 
       }
@@ -2103,6 +2187,7 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
    if (  (seq_threshold >= coarse_threshold) && (coarse_size > coarse_threshold) && (level != max_levels-1))
    {
       hypre_seqAMGSetup( amg_data, level, coarse_threshold);
+
    }
    else if (grid_relax_type[3] == 9 || grid_relax_type[3] == 99)  /*use of Gaussian elimination on coarsest level */
    {
@@ -2153,7 +2238,7 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
          hypre_ParVectorSetPartitioningOwner(U_array[level],0);
       }   
    }
-   
+
    /*-----------------------------------------------------------------------
     * enter all the stuff created, A[level], P[level], CF_marker[level],
     * for levels 1 through coarsest, into amg_data data structure
@@ -2170,6 +2255,7 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
     *-----------------------------------------------------------------------*/
 
    if (addlvl > -1 || 
+	grid_relax_type[1] == 7 || grid_relax_type[2] == 7 || grid_relax_type[3] == 7 ||
 	grid_relax_type[1] == 8 || grid_relax_type[2] == 8 || grid_relax_type[3] == 8 ||
 	grid_relax_type[1] == 13 || grid_relax_type[2] == 13 || grid_relax_type[3] == 13 ||
 	grid_relax_type[1] == 14 || grid_relax_type[2] == 14 || grid_relax_type[3] == 14 ||
@@ -2185,6 +2271,10 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
       min_eig_est = hypre_CTAlloc(HYPRE_Real, num_levels);
       hypre_ParAMGDataMaxEigEst(amg_data) = max_eig_est;
       hypre_ParAMGDataMinEigEst(amg_data) = min_eig_est;
+      cheby_ds = hypre_CTAlloc(HYPRE_Real *, num_levels);
+      cheby_coefs = hypre_CTAlloc(HYPRE_Real *, num_levels);
+      hypre_ParAMGDataChebyDS(amg_data) = cheby_ds;
+      hypre_ParAMGDataChebyCoefs(amg_data) = cheby_coefs;
    }
    if (grid_relax_type[0] == 15 ||grid_relax_type[1] == 15 ||  grid_relax_type[2] == 15 || grid_relax_type[3] == 15)
       /* CG */
@@ -2253,7 +2343,7 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
          }
       }
    }
-   for (j = addlvl; j < num_levels; j++)
+   for (j = addlvl; j < hypre_min(add_end+1, num_levels) ; j++)
    {
       if (add_rlx == 18 )
       {
@@ -2263,15 +2353,92 @@ hypre_BoomerAMGSetup( void               *amg_vdata,
                hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, NULL, &l1_norms[j]);
       }
    }
+   for (j = add_end+1; j < num_levels; j++)
+   {
+      if (num_threads == 1)
+      {
+         if (j < num_levels-1 && (grid_relax_type[1] == 8 || grid_relax_type[1] == 13 || 
+		grid_relax_type[1] == 14 || grid_relax_type[2] == 8 || grid_relax_type[2] == 13 ||
+		grid_relax_type[2] == 14)) 
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1Norms(A_array[j], 4, CF_marker_array[j], &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1Norms(A_array[j], 4, NULL, &l1_norms[j]);
+         }
+         else if ((grid_relax_type[3] == 8 || grid_relax_type[3] == 13 || grid_relax_type[3] == 14) 
+		&& j == num_levels-1)
+         {
+            hypre_ParCSRComputeL1Norms(A_array[j], 4, NULL, &l1_norms[j]);
+         }
+         if ((grid_relax_type[1] == 18 || grid_relax_type[2] == 18)  && j < num_levels-1)
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1Norms(A_array[j], 1, CF_marker_array[j], &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1Norms(A_array[j], 1, NULL, &l1_norms[j]);
+         }
+         else if (grid_relax_type[3] == 18 && j == num_levels-1)
+         {
+            hypre_ParCSRComputeL1Norms(A_array[j], 1, NULL, &l1_norms[j]);
+         }
+      }
+      else
+      {
+         if (j < num_levels-1 && (grid_relax_type[1] == 8 || grid_relax_type[1] == 13 || 
+		grid_relax_type[1] == 14 || grid_relax_type[2] == 8 || grid_relax_type[2] == 13 ||
+		grid_relax_type[2] == 14)) 
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, CF_marker_array[j] , &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, NULL, &l1_norms[j]);
+         }
+         else if ((grid_relax_type[3] == 8 || grid_relax_type[3] == 13 || grid_relax_type[3] == 14) 
+		&& j == num_levels-1)
+         {
+            hypre_ParCSRComputeL1NormsThreads(A_array[j], 4, num_threads, NULL, &l1_norms[j]);
+         }
+         if ((grid_relax_type[1] == 18 || grid_relax_type[2] == 18)  && j < num_levels-1)
+         {
+            if (relax_order)
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, CF_marker_array[j], &l1_norms[j]);
+            else
+               hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, NULL, &l1_norms[j]);
+         }
+         else if (grid_relax_type[3] == 18 && j == num_levels-1)
+         {
+            hypre_ParCSRComputeL1NormsThreads(A_array[j], 1, num_threads, NULL, &l1_norms[j]);
+         }
+      }
+   }
    for (j = 0; j < num_levels; j++)
    {
-      if (grid_relax_type[1] == 16 || grid_relax_type[2] == 16 || (grid_relax_type[3] == 16 && j== (num_levels-1)))
+      if (grid_relax_type[1] == 7 || grid_relax_type[2] == 7 || (grid_relax_type[3] == 7 && j== (num_levels-1)))
       {
-         HYPRE_Int scale = 1;
-         HYPRE_Real temp_d, temp_d2;
-         hypre_ParCSRMaxEigEstimateCG(A_array[j], scale, 10, &temp_d, &temp_d2);
-         max_eig_est[j] = temp_d;
-         min_eig_est[j] = temp_d2;
+          hypre_ParCSRComputeL1Norms(A_array[j], 5, NULL, &l1_norms[j]);
+      }
+      else if (grid_relax_type[1] == 16 || grid_relax_type[2] == 16 || (grid_relax_type[3] == 16 && j== (num_levels-1)))
+      {
+         HYPRE_Int scale = hypre_ParAMGDataChebyScale(amg_data);
+         HYPRE_Int variant = hypre_ParAMGDataChebyVariant(amg_data);
+         HYPRE_Real max_eig, min_eig = 0;
+         HYPRE_Real *coefs = NULL;
+         HYPRE_Real *ds = NULL;
+         HYPRE_Int cheby_order = hypre_ParAMGDataChebyOrder(amg_data);
+         HYPRE_Int cheby_eig_est = hypre_ParAMGDataChebyEigEst(amg_data);
+         HYPRE_Real cheby_fraction = hypre_ParAMGDataChebyFraction(amg_data);
+         if (cheby_eig_est)
+	    hypre_ParCSRMaxEigEstimateCG(A_array[j], scale, cheby_eig_est, 
+		&max_eig, &min_eig);
+         else
+	    hypre_ParCSRMaxEigEstimate(A_array[j], scale, &max_eig);
+         max_eig_est[j] = max_eig;
+         min_eig_est[j] = min_eig;
+         hypre_ParCSRRelax_Cheby_Setup(A_array[j],max_eig, min_eig, 
+		cheby_fraction, cheby_order, scale, variant, &coefs, &ds);
+         cheby_coefs[j] = coefs;
+         cheby_ds[j] = ds;
       }
      else if (grid_relax_type[1] == 15 || (grid_relax_type[3] == 15 && j == (num_levels-1))  )
      {
diff --git a/src/parcsr_ls/par_cheby.c b/src/parcsr_ls/par_cheby.c
new file mode 100644
index 0000000..1b7e910
--- /dev/null
+++ b/src/parcsr_ls/par_cheby.c
@@ -0,0 +1,338 @@
+/******************************************************************************
+ *
+ * Chebyshev setup and solve
+ *
+ *****************************************************************************/
+
+#include "_hypre_parcsr_ls.h"
+#include "_hypre_parcsr_mv.h"
+#include "float.h"
+
+
+/******************************************************************************
+
+Chebyshev relaxation
+
+ 
+Can specify order 1-4 (this is the order of the resid polynomial); the
+coefficients are coded explicitly here (instead of being determined
+iteratively)
+
+
+variant 0: standard Chebyshev
+this is rlx 11 if scale == 0, and 16 if scale == 1
+
+variant 1: modified Chebyshev: T(t)*f(t) where f(t) = (1-b/t)
+this is rlx 15 if scale == 0, and 17 if scale == 1
+
+ratio indicates the fraction of the whole spectrum to use (so .5
+means half, and .1 means 10 percent)
+
+
+*******************************************************************************/
+
+HYPRE_Int hypre_ParCSRRelax_Cheby_Setup(hypre_ParCSRMatrix *A, /* matrix to relax with */
+                            HYPRE_Real max_eig,      
+                            HYPRE_Real min_eig,     
+                            HYPRE_Real fraction,   
+                            HYPRE_Int order,            /* polynomial order */
+                            HYPRE_Int scale,            /* scale by diagonal?*/
+                            HYPRE_Int variant,           
+                            HYPRE_Real **coefs_ptr,
+                            HYPRE_Real **ds_ptr)   /* output: 1/sqrt(diag) scaling */
+{
+   hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+   HYPRE_Real     *A_diag_data  = hypre_CSRMatrixData(A_diag);
+   HYPRE_Int            *A_diag_i     = hypre_CSRMatrixI(A_diag);
+
+   HYPRE_Real theta, delta;
+   
+   HYPRE_Real den;
+   HYPRE_Real upper_bound, lower_bound;
+   
+   HYPRE_Int j;
+   HYPRE_Int num_rows = hypre_CSRMatrixNumRows(A_diag);
+ 
+   HYPRE_Real *coefs = NULL;
+   
+   HYPRE_Int cheby_order;
+
+   HYPRE_Real *ds_data = NULL;
+   HYPRE_Real  diag;
+
+   /* u = u + p(A)r */
+
+   if (order > 4)
+      order = 4;
+   if (order < 1)
+      order = 1;
+
+   coefs = hypre_CTAlloc(HYPRE_Real, order+1);
+   /* we are using the order of p(A) */
+   cheby_order = order -1;
+   
+    /* make sure we are large enough -  Adams et al. 2003 */
+   upper_bound = max_eig * 1.1;
+   /* lower_bound = max_eig/fraction; */
+   lower_bound = (upper_bound - min_eig)* fraction + min_eig; 
+
+
+   /* theta and delta */
+   theta = (upper_bound + lower_bound)/2;
+   delta = (upper_bound - lower_bound)/2;
+
+   if (variant == 1 )
+   {
+      switch ( cheby_order ) /* these are the corresponding cheby polynomials: u = u_o + s(A)r_0  - so order is
+                                one less than the resid poly: r(t) = 1 - t*s(t) */
+      {
+         case 0: 
+            coefs[0] = 1.0/theta;     
+            
+            break;
+            
+         case 1:  /* (del - t + 2*th)/(th^2 + del*th) */
+            den = (theta*theta + delta*theta);
+            
+            coefs[0] = (delta + 2*theta)/den;     
+            coefs[1] = -1.0/den;
+            
+            break;
+            
+         case 2:  /* (4*del*th - del^2 - t*(2*del + 6*th) + 2*t^2 + 6*th^2)/(2*del*th^2 - del^2*th - del^3 + 2*th^3)*/
+            den = 2*delta*theta*theta - delta*delta*theta - pow(delta,3) + 2*pow(theta,3);
+            
+            coefs[0] = (4*delta*theta - pow(delta,2) +  6*pow(theta,2))/den;
+            coefs[1] = -(2*delta + 6*theta)/den;
+            coefs[2] =  2/den;
+            
+            break;
+            
+         case 3: /* -(6*del^2*th - 12*del*th^2 - t^2*(4*del + 16*th) + t*(12*del*th - 3*del^2 + 24*th^2) + 3*del^3 + 4*t^3 - 16*th^3)/(4*del*th^3 - 3*del^2*th^2 - 3*del^3*th + 4*th^4)*/
+            den = - (4*delta*pow(theta,3) - 3*pow(delta,2)*pow(theta,2) - 3*pow(delta,3)*theta + 4*pow(theta,4) );
+            
+            coefs[0] = (6*pow(delta,2)*theta - 12*delta*pow(theta,2) + 3*pow(delta,3) - 16*pow(theta,3)   )/den;
+            coefs[1] = (12*delta*theta - 3*pow(delta,2) + 24*pow(theta,2))/den;
+            coefs[2] =  -( 4*delta + 16*theta)/den;
+            coefs[3] = 4/den;
+            
+            break;
+      }
+   }
+   
+   else /* standard chebyshev */
+   {
+   
+      switch ( cheby_order ) /* these are the corresponding cheby polynomials: u = u_o + s(A)r_0  - so order is
+                                one less than the resid poly: r(t) = 1 - t*s(t) */
+      {
+         case 0: 
+            coefs[0] = 1.0/theta;     
+            break;
+            
+         case 1:  /* (  2*t - 4*th)/(del^2 - 2*th^2) */
+            den = delta*delta - 2*theta*theta;
+            
+            coefs[0] = -4*theta/den;     
+            coefs[1] = 2/den;   
+            
+            break;
+            
+         case 2: /* (3*del^2 - 4*t^2 + 12*t*th - 12*th^2)/(3*del^2*th - 4*th^3)*/
+            den = 3*(delta*delta)*theta - 4*(theta*theta*theta);
+            
+            coefs[0] = (3*delta*delta - 12 *theta*theta)/den;
+            coefs[1] = 12*theta/den;
+            coefs[2] = -4/den; 
+            
+            break;
+            
+         case 3: /*(t*(8*del^2 - 48*th^2) - 16*del^2*th + 32*t^2*th - 8*t^3 + 32*th^3)/(del^4 - 8*del^2*th^2 + 8*th^4)*/
+            den = pow(delta,4) - 8*delta*delta*theta*theta + 8*pow(theta,4);
+            
+            coefs[0] = (32*pow(theta,3)- 16*delta*delta*theta)/den;
+            coefs[1] = (8*delta*delta - 48*theta*theta)/den;
+            coefs[2] = 32*theta/den;
+            coefs[3] = -8/den;
+            
+            break;
+      }
+   }
+   *coefs_ptr = coefs;
+
+   if (scale)
+   {
+      /*grab 1/sqrt(diagonal) */
+      ds_data = hypre_CTAlloc(HYPRE_Real, num_rows);
+      
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j,diag) HYPRE_SMP_SCHEDULE 
+#endif
+      for (j = 0; j < num_rows; j++)
+      {
+         diag = A_diag_data[A_diag_i[j]];
+         ds_data[j] = 1/sqrt(diag);
+      }
+
+   }/* end of scaling code */
+   *ds_ptr = ds_data;
+
+   return hypre_error_flag;
+}
+
+HYPRE_Int hypre_ParCSRRelax_Cheby_Solve(hypre_ParCSRMatrix *A, /* matrix to relax with */
+                            hypre_ParVector *f,    /* right-hand side */
+                            HYPRE_Real *ds_data,
+                            HYPRE_Real *coefs,
+                            HYPRE_Int order,            /* polynomial order */
+                            HYPRE_Int scale,            /* scale by diagonal?*/
+                            HYPRE_Int variant,           
+                            hypre_ParVector *u,   /* initial/updated approximation */
+                            hypre_ParVector *v,   /* temporary vector */
+                            hypre_ParVector *r )  /* another temporary vector */
+{
+   hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
+   HYPRE_Real *u_data = hypre_VectorData(hypre_ParVectorLocalVector(u));
+   HYPRE_Real *f_data = hypre_VectorData(hypre_ParVectorLocalVector(f));
+   HYPRE_Real *v_data = hypre_VectorData(hypre_ParVectorLocalVector(v));
+
+   HYPRE_Real  *r_data = hypre_VectorData(hypre_ParVectorLocalVector(r));
+
+   HYPRE_Int i, j;
+   HYPRE_Int num_rows = hypre_CSRMatrixNumRows(A_diag);
+ 
+   HYPRE_Real mult;
+   HYPRE_Real *orig_u;
+   
+   HYPRE_Int cheby_order;
+
+   HYPRE_Real  *tmp_data;
+
+   hypre_ParVector    *tmp_vec;
+
+   /* u = u + p(A)r */
+
+   if (order > 4)
+      order = 4;
+   if (order < 1)
+      order = 1;
+
+   /* we are using the order of p(A) */
+   cheby_order = order -1;
+   
+   orig_u = hypre_CTAlloc(HYPRE_Real, num_rows);
+
+   if (!scale)
+   {
+      /* get residual: r = f - A*u */
+      hypre_ParVectorCopy(f, r); 
+      hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r);
+
+      for ( i = 0; i < num_rows; i++ ) 
+      {
+         orig_u[i] = u_data[i];
+         u_data[i] = r_data[i] * coefs[cheby_order]; 
+      }
+      for (i = cheby_order - 1; i >= 0; i-- ) 
+      {
+         hypre_ParCSRMatrixMatvec(1.0, A, u, 0.0, v);
+         mult = coefs[i];
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+         for ( j = 0; j < num_rows; j++ )
+         {
+            u_data[j] = mult * r_data[j] + v_data[j];
+         }
+      }
+
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE 
+#endif
+      for ( i = 0; i < num_rows; i++ ) 
+      {
+         u_data[i] = orig_u[i] + u_data[i];
+      }
+   }
+   else /* scaling! */
+   {
+      
+      /* create a temporary vector for the scaled iterates */
+      
+      tmp_vec = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
+                                      hypre_ParCSRMatrixGlobalNumRows(A),
+                                      hypre_ParCSRMatrixRowStarts(A));
+      hypre_ParVectorInitialize(tmp_vec);
+      hypre_ParVectorSetPartitioningOwner(tmp_vec,0);
+      tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec));
+
+      /* compute the scaled residual:
+         r = D^(-1/2) (f - A*u) */
+
+      hypre_ParCSRMatrixMatvec(-1.0, A, u, 0.0, tmp_vec);
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+      for ( j = 0; j < num_rows; j++ ) 
+      {
+         r_data[j] = ds_data[j] * (f_data[j] + tmp_data[j]);
+      }
+
+      /* save original u, then start 
+         the iteration by multiplying r by the cheby coef.*/
+
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+      for ( j = 0; j < num_rows; j++ ) 
+      {
+         orig_u[j] = u_data[j]; /* orig, unscaled u */
+
+         u_data[j] = r_data[j] * coefs[cheby_order]; 
+      }
+
+      /* now do the other coefficients */   
+      for (i = cheby_order - 1; i >= 0; i-- ) 
+      {
+         /* v = D^(-1/2)AD^(-1/2)u */
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+         for ( j = 0; j < num_rows; j++ )
+         {
+            tmp_data[j]  =  ds_data[j] * u_data[j];
+         }
+         hypre_ParCSRMatrixMatvec(1.0, A, tmp_vec, 0.0, v);
+
+         /* u_new = coef*r + v*/
+         mult = coefs[i];
+
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+         for ( j = 0; j < num_rows; j++ )
+         {
+            u_data[j] = mult * r_data[j] + ds_data[j]*v_data[j];
+         }
+         
+      } /* end of cheby_order loop */
+
+      /* now we have to scale u_data before adding it to u_orig*/
+
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE 
+#endif
+      for ( j = 0; j < num_rows; j++ ) 
+      {
+         u_data[j] = orig_u[j] + ds_data[j]*u_data[j];
+      }
+   
+      hypre_ParVectorDestroy(tmp_vec);  
+
+   }/* end of scaling code */
+
+   hypre_TFree(orig_u);
+  
+   return hypre_error_flag;
+}
+
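To make the bound-to-coefficient mapping in this new file concrete, here is a standalone sketch of the standard (variant 0) case with order 2 (cheby_order 1), using the same upper/lower bound, theta, and delta construction as the setup routine above; the input eigenvalue estimates are illustrative:

   #include <stdio.h>

   int main(void)
   {
      double max_eig = 2.0, min_eig = 0.05, fraction = 0.3;

      double upper = max_eig * 1.1;   /* safety margin, as in the setup */
      double lower = (upper - min_eig) * fraction + min_eig;
      double theta = (upper + lower) / 2;
      double delta = (upper - lower) / 2;

      /* u = u_0 + s(A) r_0 with s(t) = coefs[0] + coefs[1]*t,
         so the residual polynomial r(t) = 1 - t*s(t) is quadratic */
      double den = delta * delta - 2 * theta * theta;
      double coefs[2] = { -4 * theta / den, 2 / den };

      printf("theta = %g, delta = %g, coefs = (%g, %g)\n",
             theta, delta, coefs[0], coefs[1]);
      return 0;
   }
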
diff --git a/src/parcsr_ls/par_coarsen.c b/src/parcsr_ls/par_coarsen.c
index 96fda4b..15e78bf 100644
--- a/src/parcsr_ls/par_coarsen.c
+++ b/src/parcsr_ls/par_coarsen.c
@@ -1118,7 +1118,8 @@ hypre_BoomerAMGCoarsenRuge( hypre_ParCSRMatrix    *S,
          }
          else
          {
-            if (measure < 0) hypre_printf("negative measure!\n");
+	    if (measure < 0) hypre_error_w_msg(HYPRE_ERROR_GENERIC,"negative measure!\n");
+            /*if (measure < 0) hypre_printf("negative measure!\n");*/
             CF_marker[j] = f_pnt;
             for (k = S_i[j]; k < S_i[j+1]; k++)
             {
diff --git a/src/parcsr_ls/par_cr.c b/src/parcsr_ls/par_cr.c
index fb3c809..4ae1241 100644
--- a/src/parcsr_ls/par_cr.c
+++ b/src/parcsr_ls/par_cr.c
@@ -90,7 +90,7 @@ HYPRE_Int hypre_cr(HYPRE_Int *A_i, HYPRE_Int *A_j, HYPRE_Real *A_data, HYPRE_Int
    hypre_fprintf(stdout,"-----------------------\n");
 
    for (i = 0; i < n; i++) 
-      e1[i] = 1.0e0+.1*rand();
+      e1[i] = 1.0e0+.1*hypre_RandI();
    
   /* stages */
    while(1){
@@ -133,8 +133,8 @@ HYPRE_Int hypre_cr(HYPRE_Int *A_i, HYPRE_Int *A_j, HYPRE_Real *A_data, HYPRE_Int
 	    if (cf[i] ==  cpt) 
                nc+=1.0e0;
 	    else if (cf[i] ==  fpt){ 
-               e0[i] = 1.0e0+.1*rand();
-               e1[i] = 1.0e0+.1*rand();
+               e0[i] = 1.0e0+.1*hypre_RandI();
+               e1[i] = 1.0e0+.1*hypre_RandI();
             }
          }
          nstages += 1;
@@ -2735,7 +2735,7 @@ hypre_BoomerAMGCoarsenCR( hypre_ParCSRMatrix    *A,
 
    for (i = 0; i < num_variables; i++) 
       e1[i] = 1.0e0;
-      /*e1[i] = 1.0e0+.1*rand();*/
+      /*e1[i] = 1.0e0+.1*hypre_RandI();*/
    
   /* stages */
    while(1)
@@ -2852,7 +2852,7 @@ hypre_BoomerAMGCoarsenCR( hypre_ParCSRMatrix    *A,
                if (CF_marker[i] ==  fpt)
                {
                   e1[i] = 1.0e0;
-                  /*e1[i] = 1.0e0+.1*rand();*/
+                  /*e1[i] = 1.0e0+.1*hypre_RandI();*/
                   e0[i] = e1[i];
                }
             }
@@ -3011,8 +3011,8 @@ hypre_BoomerAMGCoarsenCR( hypre_ParCSRMatrix    *A,
                   num_coarse++;
 	       else if (CF_marker[i] ==  fpt)
                { 
-                  e0[i] = 1.0e0+.1*rand();
-                  e1[i] = 1.0e0+.1*rand();
+                  e0[i] = 1.0e0+.1*hypre_RandI();
+                  e1[i] = 1.0e0+.1*hypre_RandI();
                }
             }
          }
@@ -3035,8 +3035,8 @@ hypre_BoomerAMGCoarsenCR( hypre_ParCSRMatrix    *A,
                   for (j=0; j < num_functions; j++)
                   { 
 		     /*CF_marker[jj] = CFN_marker[i];
-                     e0[jj] = 1.0e0+.1*rand();
-                     e1[jj++] = 1.0e0+.1*rand();*/
+                     e0[jj] = 1.0e0+.1*hypre_RandI();
+                     e1[jj++] = 1.0e0+.1*hypre_RandI();*/
                      e0[jj] = 1.0e0;
                      e1[jj++] = 1.0e0;
                   } 
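The raw rand() calls in this file (and below in par_gsmg.c and par_jacobi_interp.c) are replaced by hypre_RandI()/hypre_Rand(), whose definitions live elsewhere in the tree. Assuming those are seeded, portable generators, the payoff is that CR test vectors and smooth-vector samples become reproducible across platforms and C libraries. A minimal sketch of that kind of generator (hypothetical names; not hypre's implementation):

   #include <stdio.h>

   /* hypothetical linear congruential generator; hypre's may differ */
   static unsigned long my_state = 1;

   void my_SeedRand(unsigned long seed) { my_state = seed; }

   double my_Rand(void)   /* uniform in [0,1) */
   {
      my_state = my_state * 1103515245UL + 12345UL;
      return (double)((my_state >> 16) & 0x7fffUL) / 32768.0;
   }

   int main(void)
   {
      my_SeedRand(42);
      for (int i = 0; i < 3; i++)
         printf("%g\n", 1.0 + 0.1 * my_Rand());   /* same on every platform */
      return 0;
   }
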
diff --git a/src/parcsr_ls/par_cycle.c b/src/parcsr_ls/par_cycle.c
index 1f09407..b46892f 100644
--- a/src/parcsr_ls/par_cycle.c
+++ b/src/parcsr_ls/par_cycle.c
@@ -76,10 +76,7 @@ hypre_BoomerAMGCycle( void              *amg_vdata,
 
    HYPRE_Int     block_mode;
 
-   HYPRE_Real  *max_eig_est;
-   HYPRE_Real  *min_eig_est;
    HYPRE_Int      cheby_order;
-   HYPRE_Real   cheby_fraction;
 
  /* Local variables  */
    HYPRE_Int      *lev_counter;
@@ -111,6 +108,8 @@ hypre_BoomerAMGCycle( void              *amg_vdata,
    HYPRE_Real    alpha;
    HYPRE_Real  **l1_norms = NULL;
    HYPRE_Real   *l1_norms_level;
+   HYPRE_Real   **ds = hypre_ParAMGDataChebyDS(amg_data);
+   HYPRE_Real   **coefs = hypre_ParAMGDataChebyCoefs(amg_data);
 
    HYPRE_Int seq_cg = 0;
 
@@ -158,10 +157,10 @@ hypre_BoomerAMGCycle( void              *amg_vdata,
    l1_norms            = hypre_ParAMGDataL1Norms(amg_data);
    /* smooth_option       = hypre_ParAMGDataSmoothOption(amg_data); */
 
-   max_eig_est = hypre_ParAMGDataMaxEigEst(amg_data);
+   /*max_eig_est = hypre_ParAMGDataMaxEigEst(amg_data);
    min_eig_est = hypre_ParAMGDataMinEigEst(amg_data);
+   cheby_fraction = hypre_ParAMGDataChebyFraction(amg_data);*/
    cheby_order = hypre_ParAMGDataChebyOrder(amg_data);
-   cheby_fraction = hypre_ParAMGDataChebyFraction(amg_data);
 
    cycle_op_count = hypre_ParAMGDataCycleOpCount(amg_data);
 
@@ -434,6 +433,18 @@ hypre_BoomerAMGCycle( void              *amg_vdata,
                  }
                  else /* not CF - so use through AMS */
                  {
+#ifdef HYPRE_USE_GPU
+		   hypre_ParCSRRelax(A_array[level], 
+                                       Aux_F,
+                                       1,
+                                       1,
+                                       l1_norms_level,
+                                       relax_weight[level],
+                                       omega[level],0,0,0,0,
+                                       Aux_U,
+                                       Vtemp, 
+				       Ztemp);
+#else
                     if (num_threads == 1)
                        hypre_ParCSRRelax(A_array[level],
                                        Aux_F,
@@ -457,6 +468,7 @@ hypre_BoomerAMGCycle( void              *amg_vdata,
                                               Aux_U,
                                               Vtemp,
                                               Ztemp);
+#endif
                  }
               }
               else if (relax_type == 15)
@@ -470,13 +482,11 @@ hypre_BoomerAMGCycle( void              *amg_vdata,
               }
               else if (relax_type == 16)
               { /* scaled Chebyshev */
-                 HYPRE_Int scale = 1;
-                 HYPRE_Int variant = 0;
-                 hypre_ParCSRRelax_Cheby(A_array[level],
-                                       Aux_F,
-                                       max_eig_est[level],
-                                       min_eig_est[level],
-                                       cheby_fraction, cheby_order, scale,
+                 HYPRE_Int scale = hypre_ParAMGDataChebyScale(amg_data);
+                 HYPRE_Int variant = hypre_ParAMGDataChebyVariant(amg_data);
+                 hypre_ParCSRRelax_Cheby_Solve(A_array[level], Aux_F,
+                                       ds[level], coefs[level],
+                                       cheby_order, scale,
                                        variant, Aux_U, Vtemp, Ztemp );
               }
               else if (relax_type ==17)
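For orientation, the intended call pattern after this change is setup-once, solve-many: the eigenvalue estimates, polynomial coefficients, and diagonal scaling are computed per level in hypre_BoomerAMGSetup, and only the polynomial application runs inside the cycle. A fragment, under the assumption that A, f, u, Vtemp, Ztemp are existing ParCSR/ParVector objects and that max_eig, min_eig, fraction, order, scale, variant were obtained as in the setup code above:

   /* once, during setup (per level) */
   HYPRE_Real *coefs = NULL, *ds = NULL;
   hypre_ParCSRRelax_Cheby_Setup(A, max_eig, min_eig, fraction,
                                 order, scale, variant, &coefs, &ds);

   /* every cycle: no eigenvalue work, just u <- u + p(A) r */
   hypre_ParCSRRelax_Cheby_Solve(A, f, ds, coefs, order, scale, variant,
                                 u, Vtemp, Ztemp);
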
diff --git a/src/parcsr_ls/par_gsmg.c b/src/parcsr_ls/par_gsmg.c
index 55d28e6..6ee30ac 100644
--- a/src/parcsr_ls/par_gsmg.c
+++ b/src/parcsr_ls/par_gsmg.c
@@ -563,7 +563,7 @@ hypre_BoomerAMGCreateSmoothVecs(void         *data,
    for (sample=0; sample<nsamples; sample++)
    {
        for (i=0; i<n_local; i++)
-           datax[i] = (rand()/(HYPRE_Real)RAND_MAX) - .5;
+           datax[i] = hypre_Rand() - .5;
 
        for (i=0; i<num_sweeps; i++)
        {
diff --git a/src/parcsr_ls/par_jacobi_interp.c b/src/parcsr_ls/par_jacobi_interp.c
index 2434136..99b2702 100644
--- a/src/parcsr_ls/par_jacobi_interp.c
+++ b/src/parcsr_ls/par_jacobi_interp.c
@@ -189,7 +189,7 @@ void hypre_BoomerAMGJacobiInterp_1( hypre_ParCSRMatrix * A,
    {
       if ( J_marker[i]<0 )
       {
-         if ( ((HYPRE_Real)rand())/RAND_MAX < randthresh )
+         if ( ((HYPRE_Real)hypre_Rand()) < randthresh )
          {
             hypre_printf( "%i: ", i );
             for ( m=P_diag_i[i]; m<P_diag_i[i+1]; ++m )
diff --git a/src/parcsr_ls/par_relax.c b/src/parcsr_ls/par_relax.c
index 66e6b0c..dc3dce3 100644
--- a/src/parcsr_ls/par_relax.c
+++ b/src/parcsr_ls/par_relax.c
@@ -2368,27 +2368,31 @@ HYPRE_Int  hypre_BoomerAMGRelax( hypre_ParCSRMatrix *A,
          /*-----------------------------------------------------------------
           * Copy f into temporary vector.
           *-----------------------------------------------------------------*/
-
+         PUSH_RANGE("RELAX",4);
+#ifdef HYPRE_USE_GPU
+           hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(Vtemp));
+           hypre_SeqVectorPrefetchToDevice(hypre_ParVectorLocalVector(f));
+         VecCopy(Vtemp_data,f_data,hypre_VectorSize(hypre_ParVectorLocalVector(Vtemp)),HYPRE_STREAM(4));
+#else
          hypre_ParVectorCopy(f,Vtemp);
-
+#endif 
          /*-----------------------------------------------------------------
           * Perform Matvec Vtemp=f-Au
           *-----------------------------------------------------------------*/
 
-            hypre_ParCSRMatrixMatvec(-1.0,A, u, 1.0, Vtemp);
+            hypre_ParCSRMatrixMatvec(-relax_weight,A, u, relax_weight, Vtemp);
+#ifdef HYPRE_USE_GPU
+         VecScale(u_data,Vtemp_data,l1_norms,n,HYPRE_STREAM(4));
+#else
             for (i = 0; i < n; i++)
             {
-
                /*-----------------------------------------------------------
                 * If diagonal is nonzero, relax point i; otherwise, skip it.
                 *-----------------------------------------------------------*/
-
-               if (A_diag_data[A_diag_i[i]] != zero)
-               {
-                  u_data[i] += relax_weight * Vtemp_data[i]
-                                / A_diag_data[A_diag_i[i]];
-               }
+                  u_data[i] += Vtemp_data[i] / l1_norms[i];
             }
+#endif
+         POP_RANGE;
       }
       break;
 
@@ -4372,7 +4376,7 @@ HYPRE_Int hypre_GaussElimSolve (hypre_ParAMGData *amg_data, HYPRE_Int level, HYP
    return hypre_error_flag;
 }
 
-
+HYPRE_CUDA_GLOBAL
 HYPRE_Int gselim(HYPRE_Real *A,
                  HYPRE_Real *x,
                  HYPRE_Int n)
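The relax_type 0 branch above now folds relax_weight into the matvec and divides by the precomputed l1 row norms rather than the diagonal (with a GPU fast path through VecCopy/VecScale): u <- u + w*(f - A*u)./l1 with l1[i] = sum_j |a_ij|, which is well defined whenever the row is nonzero. A standalone sketch of that update on a small CSR matrix (illustrative only, not hypre code):

   #include <stdio.h>
   #include <math.h>

   int main(void)
   {
      /* 3x3 CSR matrix [4 -1 0; -1 4 -1; 0 -1 4], rhs f = 1 */
      int    ia[] = {0, 2, 5, 7};
      int    ja[] = {0, 1,  0, 1, 2,  1, 2};
      double a[]  = {4.0, -1.0,  -1.0, 4.0, -1.0,  -1.0, 4.0};
      double f[]  = {1.0, 1.0, 1.0}, u[] = {0.0, 0.0, 0.0};
      double r[3], l1[3], w = 1.0;
      int    n = 3;

      for (int i = 0; i < n; i++)       /* l1[i] = sum_j |a_ij| */
      {
         l1[i] = 0.0;
         for (int k = ia[i]; k < ia[i+1]; k++) l1[i] += fabs(a[k]);
      }
      for (int sweep = 0; sweep < 30; sweep++)
      {
         for (int i = 0; i < n; i++)    /* r = w*(f - A*u) */
         {
            r[i] = f[i];
            for (int k = ia[i]; k < ia[i+1]; k++) r[i] -= a[k] * u[ja[k]];
            r[i] *= w;
         }
         for (int i = 0; i < n; i++)    /* u += r ./ l1 */
            u[i] += r[i] / l1[i];
      }
      printf("u = (%g, %g, %g)\n", u[0], u[1], u[2]);   /* -> A u ~= f */
      return 0;
   }
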
diff --git a/src/parcsr_ls/par_relax_more.c b/src/parcsr_ls/par_relax_more.c
index 0e9e33b..f3454d5 100644
--- a/src/parcsr_ls/par_relax_more.c
+++ b/src/parcsr_ls/par_relax_more.c
@@ -26,52 +26,61 @@ HYPRE_Int hypre_ParCSRMaxEigEstimate(hypre_ParCSRMatrix *A, /* matrix to relax w
                               
    HYPRE_Real e_max;
    HYPRE_Real row_sum, max_norm;
-   HYPRE_Real *col_val;
+   HYPRE_Real *A_diag_data;
+   HYPRE_Real *A_offd_data;
    HYPRE_Real temp;
    HYPRE_Real diag_value;
 
    HYPRE_Int   pos_diag, neg_diag;
-   HYPRE_Int   start_row, end_row;
-   HYPRE_Int   row_length;
-   HYPRE_Int *col_ind;
+   HYPRE_Int  A_num_rows;
+   HYPRE_Int *A_diag_i;
+   HYPRE_Int *A_offd_i;
    HYPRE_Int   j;
-   HYPRE_Int i;
+   HYPRE_Int i, start;
    
 
    /* estimate with the inf-norm of A - should be ok for SPD matrices */
 
-   start_row  = hypre_ParCSRMatrixFirstRowIndex(A);
-   end_row    =  hypre_ParCSRMatrixLastRowIndex(A);
+   A_num_rows  =  hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
+   A_diag_i    =  hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(A));
+   A_diag_data =  hypre_CSRMatrixData(hypre_ParCSRMatrixDiag(A));
+   A_offd_i    =  hypre_CSRMatrixI(hypre_ParCSRMatrixOffd(A));
+   A_offd_data =  hypre_CSRMatrixData(hypre_ParCSRMatrixOffd(A));
     
    max_norm = 0.0;
 
    pos_diag = neg_diag = 0;
  
-   for ( i = start_row; i <= end_row; i++ )
+   for ( i = 0; i < A_num_rows; i++ )
    {
-      HYPRE_ParCSRMatrixGetRow((HYPRE_ParCSRMatrix) A, i, &row_length, &col_ind, &col_val);
-
-      row_sum = 0.0;
-
-      for (j = 0; j < row_length; j++)
+      start = A_diag_i[i];
+      diag_value = A_diag_data[start];
+      if (diag_value > 0) 
       {
-         if (j==0) diag_value = fabs(col_val[j]);
-     
-         row_sum += fabs(col_val[j]);
+         pos_diag++;
+      }
+      if (diag_value < 0) 
+      {
+         neg_diag++;
+         diag_value = -diag_value;
+      }
+      row_sum = diag_value;
 
-         if ( col_ind[j] == i && col_val[j] > 0.0 ) pos_diag++;
-         if ( col_ind[j] == i && col_val[j] < 0.0 ) neg_diag++;
+      /*for (j = 0; j < row_length; j++)*/
+      for (j = start+1; j < A_diag_i[i+1]; j++)
+      {
+         row_sum += fabs(A_diag_data[j]);
+      }
+      for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
+      {
+         row_sum += fabs(A_offd_data[j]);
       }
       if (scale)
       {
          if (diag_value != 0.0)
             row_sum = row_sum/diag_value;
       }
-      
-
       if ( row_sum > max_norm ) max_norm = row_sum;
-
-      HYPRE_ParCSRMatrixRestoreRow((HYPRE_ParCSRMatrix) A, i, &row_length, &col_ind, &col_val);
    }
 
    /* get max across procs */
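The rewritten estimator walks the diag and offd CSR arrays directly instead of going through GetRow/RestoreRow, and relies on the convention that the diagonal entry is stored first in each row of the diag part; with scale set, the result is the Gershgorin-type bound max_i sum_j |a_ij| / |a_ii|. A standalone sketch of the same loop on a serial CSR matrix (illustrative only):

   #include <stdio.h>
   #include <math.h>

   int main(void)
   {
      /* [4 -1 0; -1 4 -1; 0 -1 4], diagonal entry stored first per row */
      int    ia[] = {0, 2, 5, 7};
      int    ja[] = {0, 1,  1, 0, 2,  2, 1};
      double a[]  = {4.0, -1.0,  4.0, -1.0, -1.0,  4.0, -1.0};
      int    n = 3, scale = 1;

      double max_norm = 0.0;
      for (int i = 0; i < n; i++)
      {
         double diag = fabs(a[ia[i]]);       /* first entry is a_ii */
         double row_sum = diag;
         for (int k = ia[i] + 1; k < ia[i+1]; k++) row_sum += fabs(a[k]);
         if (scale && diag != 0.0) row_sum /= diag;
         if (row_sum > max_norm) max_norm = row_sum;
      }
      printf("max eig estimate: %g\n", max_norm);   /* 1.5 here */
      return 0;
   }
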
diff --git a/src/parcsr_ls/par_stats.c b/src/parcsr_ls/par_stats.c
index 8bb9b96..df34ec3 100644
--- a/src/parcsr_ls/par_stats.c
+++ b/src/parcsr_ls/par_stats.c
@@ -155,6 +155,7 @@ hypre_BoomerAMGSetupStats( void               *amg_vdata,
    HYPRE_Int additive;
    HYPRE_Int mult_additive;
    HYPRE_Int simple;
+   HYPRE_Int add_end;
    HYPRE_Int add_rlx;
    HYPRE_Real add_rlx_wt;
  
@@ -175,10 +176,10 @@ hypre_BoomerAMGSetupStats( void               *amg_vdata,
    additive = hypre_ParAMGDataAdditive(amg_data);
    mult_additive = hypre_ParAMGDataMultAdditive(amg_data);
    simple = hypre_ParAMGDataSimple(amg_data);
+   add_end = hypre_ParAMGDataAddLastLvl(amg_data);
    add_rlx = hypre_ParAMGDataAddRelaxType(amg_data);
    add_rlx_wt = hypre_ParAMGDataAddRelaxWt(amg_data);
 
-
    A_block_array = hypre_ParAMGDataABlockArray(amg_data);
    P_block_array = hypre_ParAMGDataPBlockArray(amg_data);
 
@@ -923,19 +924,40 @@ hypre_BoomerAMGSetupStats( void               *amg_vdata,
 
       if (additive == 0 || mult_additive == 0 || simple == 0)
       {
+         HYPRE_Int add_lvl = add_end;
+         if (add_end == -1) add_lvl = num_levels-1;
          if (additive > -1)
-            hypre_printf( "  Additive V-cycle starting at level %d  \n", additive);
+            hypre_printf( "  Additive V-cycle 1st level %d last level %d: \n", additive, add_lvl);
          if (mult_additive > -1)
-            hypre_printf( "  Mult-Additive V-cycle starting at level %d  \n", mult_additive);
+            hypre_printf( "  Mult-Additive V-cycle 1st level %d last level %d: \n", mult_additive, add_lvl);
          if (simple > -1)
-            hypre_printf( "  Simplified Mult-Additive V-cycle starting at level %d  \n", simple);
-         hypre_printf( "\n");
+            hypre_printf( "  Simplified Mult-Additive V-cycle 1st level %d: last level %d \n", simple, add_lvl);
          hypre_printf( "  Relaxation Parameters:\n");
-         hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
-         hypre_printf( "            Number of sweeps:         %4d   %2d  %4d \n",
+         if (add_lvl == num_levels-1)
+         {
+            hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d  %4d \n",
               num_grid_sweeps[1],
-              num_grid_sweeps[2],(2*num_grid_sweeps[3]));
-         hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d   %2d \n", add_rlx, add_rlx, add_rlx);
+              num_grid_sweeps[1],(2*num_grid_sweeps[1]));
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d   %2d \n", add_rlx, add_rlx, add_rlx);
+         }
+         else
+         {
+            hypre_printf( "   Visiting Grid:                     down   up\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d\n",
+              num_grid_sweeps[1], num_grid_sweeps[1]);
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d\n", add_rlx, add_rlx);
+         }
+         if (add_lvl < num_levels -1)
+         {
+            hypre_printf( " \n");
+            hypre_printf( "Multiplicative portion: \n");
+            hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d  %4d\n",
+              num_grid_sweeps[1], num_grid_sweeps[2], num_grid_sweeps[3]);
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:   %4d   %2d  %4d\n",
+              grid_relax_type[1], grid_relax_type[2], grid_relax_type[3]);
+         }
          if (add_rlx == 0) hypre_printf( "   Relaxation Weight:   %e \n", add_rlx_wt);
          hypre_printf( "   Point types, partial sweeps (1=C, -1=F):\n");
          hypre_printf( "                  Pre-CG relaxation (down):");
@@ -953,12 +975,25 @@ hypre_BoomerAMGSetupStats( void               *amg_vdata,
       }
       else if (additive > 0 || mult_additive > 0 || simple > 0)
       {
+         HYPRE_Int add_lvl = add_end;
+         if (add_end == -1) add_lvl = num_levels-1;
          hypre_printf( "  Relaxation Parameters:\n");
-         hypre_printf( "   Visiting Grid:                     down   up  \n");
-         hypre_printf( "            Number of sweeps:         %4d   %2d  \n",
+         if (add_lvl < num_levels -1)
+         {
+            hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d  %4d\n",
+              num_grid_sweeps[1], num_grid_sweeps[2], num_grid_sweeps[3]);
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:   %4d   %2d  %4d\n",
+              grid_relax_type[1], grid_relax_type[2], grid_relax_type[3]);
+         }
+         else
+         {
+            hypre_printf( "   Visiting Grid:                     down   up  \n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d  \n",
               num_grid_sweeps[1], num_grid_sweeps[2]);
-         hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:   %4d   %2d  \n",
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:   %4d   %2d  \n",
               grid_relax_type[1], grid_relax_type[2]);
+         }
          hypre_printf( "   Point types, partial sweeps (1=C, -1=F):\n");
          if (grid_relax_points && grid_relax_type[1] != 8)
          {
@@ -995,18 +1030,27 @@ hypre_BoomerAMGSetupStats( void               *amg_vdata,
          }
          hypre_printf( "\n\n");
          if (additive > -1)
-            hypre_printf( "  Additive V-cycle starting at level %d  \n", additive);
+            hypre_printf( "  Additive V-cycle 1st level %d last level %d:  \n", additive, add_lvl);
          if (mult_additive > -1)
-            hypre_printf( "  Mult-Additive V-cycle starting at level %d  \n", mult_additive);
+            hypre_printf( "  Mult-Additive V-cycle 1st level %d last level %d: \n", mult_additive, add_lvl);
          if (simple > -1)
-            hypre_printf( "  Simplified Mult-Additive V-cycle starting at level %d  \n", simple);
-         hypre_printf( "\n");
+            hypre_printf( "  Simplified Mult-Additive V-cycle 1st level %d: last level %d  \n", simple, add_lvl);
          hypre_printf( "  Relaxation Parameters:\n");
-         hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
-         hypre_printf( "            Number of sweeps:         %4d   %2d  %4d \n",
+         if (add_lvl == num_levels-1)
+         {
+            hypre_printf( "   Visiting Grid:                     down   up  coarse\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d  %4d \n",
               num_grid_sweeps[1],
-              num_grid_sweeps[2],(2*num_grid_sweeps[3]));
-         hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d   %2d \n", add_rlx, add_rlx, add_rlx);
+              num_grid_sweeps[1],(2*num_grid_sweeps[1]));
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d   %2d \n", add_rlx, add_rlx, add_rlx);
+         }
+         else
+         {
+            hypre_printf( "   Visiting Grid:                     down   up\n");
+            hypre_printf( "            Number of sweeps:         %4d   %2d\n",
+              num_grid_sweeps[1], num_grid_sweeps[1]);
+            hypre_printf( "   Type 0=Jac, 3=hGS, 6=hSGS, 9=GE:    %2d   %2d\n", add_rlx, add_rlx);
+         }
          if (add_rlx == 0) hypre_printf( "   Relaxation Weight:   %e \n", add_rlx_wt);
          hypre_printf( "   Point types, partial sweeps (1=C, -1=F):\n");
          hypre_printf( "                  Pre-CG relaxation (down):");
diff --git a/src/parcsr_ls/par_strength.c b/src/parcsr_ls/par_strength.c
index 4bb3214..4ecd387 100644
--- a/src/parcsr_ls/par_strength.c
+++ b/src/parcsr_ls/par_strength.c
@@ -1497,6 +1497,7 @@ HYPRE_Int hypre_BoomerAMGCreate2ndS( hypre_ParCSRMatrix *S, HYPRE_Int *CF_marker
 
       temp = hypre_UnorderedIntSetCopyToArray(&found_set, &num_cols_offd_C);
       
+      hypre_UnorderedIntSetDestroy(&found_set);
       hypre_TFree(S_ext_i);
       hypre_TFree(S_ext_j);
 
diff --git a/src/parcsr_ls/par_vardifconv.c b/src/parcsr_ls/par_vardifconv.c
index 50b67e3..ae16a8a 100644
--- a/src/parcsr_ls/par_vardifconv.c
+++ b/src/parcsr_ls/par_vardifconv.c
@@ -351,10 +351,6 @@ GenerateVarDifConv( MPI_Comm comm,
             }
    }
 
-   par_rhs = hypre_ParVectorCreate(comm, grid_size, global_part);
-   rhs = hypre_ParVectorLocalVector(par_rhs);
-   hypre_VectorData(rhs) = rhs_data;
-
 #ifdef HYPRE_NO_GLOBAL_PARTITION
 /* ideally we would use less storage earlier in this function, but this is fine
    for testing */
@@ -369,6 +365,11 @@ GenerateVarDifConv( MPI_Comm comm,
    }
 #endif
 
+   par_rhs = hypre_ParVectorCreate(comm, grid_size, global_part);
+   hypre_ParVectorOwnsPartitioning(par_rhs) = 0;
+   rhs = hypre_ParVectorLocalVector(par_rhs);
+   hypre_VectorData(rhs) = rhs_data;
+
    A = hypre_ParCSRMatrixCreate(comm, grid_size, grid_size,
                                 global_part, global_part, num_cols_offd,
                                 diag_i[local_num_rows],
diff --git a/src/parcsr_ls/par_vardifconv.c b/src/parcsr_ls/par_vardifconv_rs.c
similarity index 62%
copy from src/parcsr_ls/par_vardifconv.c
copy to src/parcsr_ls/par_vardifconv_rs.c
index 50b67e3..0d2a8cf 100644
--- a/src/parcsr_ls/par_vardifconv.c
+++ b/src/parcsr_ls/par_vardifconv_rs.c
@@ -16,12 +16,17 @@
  
 #include "_hypre_parcsr_ls.h"
  
+/* examples in Ruge & Stuben paper */
+static HYPRE_Int rs_example = 1;
+static HYPRE_Real rs_l = 3.0;
+
 /*--------------------------------------------------------------------------
- * hypre_GenerateVarDifConv
+ * GenerateRSVarDifConv: FD discretization of the examples
+ *                       in Ruge-Stuben's paper ``Algebraic Multigrid''
  *--------------------------------------------------------------------------*/
 
 HYPRE_ParCSRMatrix 
-GenerateVarDifConv( MPI_Comm comm,
+GenerateRSVarDifConv( MPI_Comm comm,
                  HYPRE_Int      nx,
                  HYPRE_Int      ny,
                  HYPRE_Int      nz, 
@@ -32,7 +37,8 @@ GenerateVarDifConv( MPI_Comm comm,
                  HYPRE_Int      q,
                  HYPRE_Int      r,
                  HYPRE_Real eps,
-		 HYPRE_ParVector *rhs_ptr)
+		 HYPRE_ParVector *rhs_ptr,
+                 HYPRE_Int type)
 {
    hypre_ParCSRMatrix *A;
    hypre_CSRMatrix *diag;
@@ -71,11 +77,17 @@ GenerateVarDifConv( MPI_Comm comm,
 
    HYPRE_Real hhx, hhy, hhz;
    HYPRE_Real xx, yy, zz;
-   HYPRE_Real afp, afm, bfp, bfm, cfp, cfm, df, ef, ff, gf;
+   HYPRE_Real afp, afm, bfp, bfm, cfp, cfm, di, ai, mux, ei, bi,
+              muy, fi, ci, muz, dfm, dfp, efm, efp, ffm, ffp, gi;
 
    hypre_MPI_Comm_size(comm,&num_procs);
    hypre_MPI_Comm_rank(comm,&my_id);
 
+   if (type >= 1 && type <= 3)
+   {
+     rs_example = type;
+   }
+
    grid_size = nx*ny*nz;
 
    hypre_GeneratePartitioning(nx,P,&nx_part);
@@ -221,115 +233,176 @@ GenerateVarDifConv( MPI_Comm comm,
    for (iz = nz_part[r]; iz < nz_part[r+1]; iz++)
    {
       zz = (HYPRE_Real)(iz+1)*hhz;
-      for (iy = ny_part[q];  iy < ny_part[q+1]; iy++)
+      for (iy = ny_part[q]; iy < ny_part[q+1]; iy++)
       {
          yy = (HYPRE_Real)(iy+1)*hhy;
          for (ix = nx_part[p]; ix < nx_part[p+1]; ix++)
          {
             xx = (HYPRE_Real)(ix+1)*hhx;
-	    afp = eps*afun(xx+0.5*hhx,yy,zz)/hhx/hhx;
-	    afm = eps*afun(xx-0.5*hhx,yy,zz)/hhx/hhx;
-	    bfp = eps*bfun(xx,yy+0.5*hhy,zz)/hhy/hhy;
-	    bfm = eps*bfun(xx,yy-0.5*hhy,zz)/hhy/hhy;
-	    cfp = eps*cfun(xx,yy,zz+0.5*hhz)/hhz/hhz;
-	    cfm = eps*cfun(xx,yy,zz-0.5*hhz)/hhz/hhz;
-	    df = dfun(xx,yy,zz)/hhx;
-	    ef = efun(xx,yy,zz)/hhy;
-	    ff = ffun(xx,yy,zz)/hhz;
-	    gf = gfun(xx,yy,zz);
+	    afp = -eps*afun_rs(xx+0.5*hhx,yy,zz)/hhx/hhx;
+	    afm = -eps*afun_rs(xx-0.5*hhx,yy,zz)/hhx/hhx;
+	    bfp = -eps*bfun_rs(xx,yy+0.5*hhy,zz)/hhy/hhy;
+	    bfm = -eps*bfun_rs(xx,yy-0.5*hhy,zz)/hhy/hhy;
+	    cfp = -eps*cfun_rs(xx,yy,zz+0.5*hhz)/hhz/hhz;
+	    cfm = -eps*cfun_rs(xx,yy,zz-0.5*hhz)/hhz/hhz;
+            /* first order terms */
+            /* x-direction */
+            di = dfun_rs(xx, yy, zz);
+            ai = afun_rs(xx, yy, zz);
+            if (di * hhx > eps * ai)
+            {
+              mux = eps * ai / (2.0 * di * hhx);
+            }
+            else if (di * hhx < -eps * ai)
+            {
+              mux = 1.0 + eps * ai / (2.0 * di * hhx);
+            }
+            else
+            {
+              mux = 0.5;
+            }
+            /* y-direction */
+            ei = efun_rs(xx, yy, zz);
+            bi = bfun_rs(xx, yy, zz);
+            if (ei * hhy > eps * bi)
+            {
+              muy = eps * bi / (2.0 * ei * hhy);
+            }
+            else if (ei * hhy < -eps * bi)
+            {
+              muy = 1.0 + eps * bi / (2.0 * ei * hhy);
+            }
+            else
+            {
+              muy = 0.5;
+            }
+            /* z-direction */
+            fi = ffun_rs(xx, yy, zz);
+            ci = cfun_rs(xx, yy, zz);
+            if (fi * hhz > eps * ci)
+            {
+              muz = eps * ci / (2.0 * fi * hhz);
+            }
+            else if (fi * hhz < -eps * ci)
+            {
+              muz = 1.0 + eps * ci / (2.0 * fi * hhz);
+            }
+            else
+            {
+              muz = 0.5;
+            }
+
+	    dfm = di * (mux - 1.0) / hhx;
+            dfp = di * mux / hhx;
+	    efm = ei * (muy - 1.0) / hhy;
+            efp = ei * muy / hhy;
+            ffm = fi * (muz - 1.0) / hhz;
+	    ffp = fi * muz / hhz;
+	    gi = gfun_rs(xx, yy, zz);
+            /* stencil: center */
             diag_j[cnt] = row_index;
-            diag_data[cnt++] = afp+afm+bfp+bfm+cfp+cfm+gf-df-ef-ff;
-	    rhs_data[row_index] = rfun(xx,yy,zz);
-	    if (ix == 0) rhs_data[row_index] += afm*bndfun(0,yy,zz);
-	    if (iy == 0) rhs_data[row_index] += bfm*bndfun(xx,0,zz);
-	    if (iz == 0) rhs_data[row_index] += cfm*bndfun(xx,yy,0);
-	    if (ix+1 == nx) rhs_data[row_index] += (afp-df)*bndfun(1.0,yy,zz);
-	    if (iy+1 == ny) rhs_data[row_index] += (bfp-ef)*bndfun(xx,1.0,zz);
-	    if (iz+1 == nz) rhs_data[row_index] += (cfp-ff)*bndfun(xx,yy,1.0);
+            diag_data[cnt++] = -(afp + afm + bfp + bfm + cfp + cfm  +
+                                 dfp + dfm + efp + efm + ffp + ffm) + gi;
+            /* rhs vector */
+	    rhs_data[row_index] = rfun_rs(xx,yy,zz);
+	    /* apply boundary conditions */
+            if (ix == 0)    rhs_data[row_index] -= (afm+dfm) * bndfun_rs(0,yy,zz);
+	    if (iy == 0)    rhs_data[row_index] -= (bfm+efm) * bndfun_rs(xx,0,zz);
+	    if (iz == 0)    rhs_data[row_index] -= (cfm+ffm) * bndfun_rs(xx,yy,0);
+	    if (ix+1 == nx) rhs_data[row_index] -= (afp+dfp) * bndfun_rs(1.0,yy,zz);
+	    if (iy+1 == ny) rhs_data[row_index] -= (bfp+efp) * bndfun_rs(xx,1.0,zz);
+	    if (iz+1 == nz) rhs_data[row_index] -= (cfp+ffp) * bndfun_rs(xx,yy,1.0);
+            /* stencil: z- */
             if (iz > nz_part[r]) 
             {
-               diag_j[cnt] = row_index-nx_local*ny_local;
-               diag_data[cnt++] = -cfm;
+               diag_j[cnt] = row_index - nx_local*ny_local;
+               diag_data[cnt++] = cfm + ffm;
             }
             else
             {
                if (iz) 
                {
                   offd_j[o_cnt] = hypre_map(ix,iy,iz-1,p,q,r-1,P,Q,R,
-                                      nx_part,ny_part,nz_part,global_part);
-                  offd_data[o_cnt++] = -cfm;
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = cfm + ffm;
                }
             }
+            /* stencil: y- */
             if (iy > ny_part[q]) 
             {
-               diag_j[cnt] = row_index-nx_local;
-               diag_data[cnt++] = -bfm;
+               diag_j[cnt] = row_index - nx_local;
+               diag_data[cnt++] = bfm + efm;
             }
             else
             {
                if (iy) 
                {
                   offd_j[o_cnt] = hypre_map(ix,iy-1,iz,p,q-1,r,P,Q,R,
-                                      nx_part,ny_part,nz_part,global_part);
-                  offd_data[o_cnt++] = -bfm;
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = bfm + efm;
                }
             }
+            /* stencil: x- */
             if (ix > nx_part[p]) 
             {
-               diag_j[cnt] = row_index-1;
-               diag_data[cnt++] = -afm;
+               diag_j[cnt] = row_index - 1;
+               diag_data[cnt++] = afm + dfm;
             }
             else
             {
                if (ix) 
                {
                   offd_j[o_cnt] = hypre_map(ix-1,iy,iz,p-1,q,r,P,Q,R,
-                                      nx_part,ny_part,nz_part,global_part);
-                  offd_data[o_cnt++] = -afm;
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = afm + dfm;
                }
             }
+            /* stencil: x+ */
             if (ix+1 < nx_part[p+1]) 
             {
-               diag_j[cnt] = row_index+1;
-               diag_data[cnt++] = -afp+df;
+               diag_j[cnt] = row_index + 1;
+               diag_data[cnt++] = afp + dfp;
             }
             else
             {
                if (ix+1 < nx) 
                {
                   offd_j[o_cnt] = hypre_map(ix+1,iy,iz,p+1,q,r,P,Q,R,
-                                      nx_part,ny_part,nz_part,global_part);
-                  offd_data[o_cnt++] = -afp+df;
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = afp + dfp;
                }
             }
+            /* stencil: y+ */
             if (iy+1 < ny_part[q+1]) 
             {
-               diag_j[cnt] = row_index+nx_local;
-               diag_data[cnt++] = -bfp +ef;
+               diag_j[cnt] = row_index + nx_local;
+               diag_data[cnt++] = bfp + efp;
             }
             else
             {
                if (iy+1 < ny) 
                {
                   offd_j[o_cnt] = hypre_map(ix,iy+1,iz,p,q+1,r,P,Q,R,
-                                      nx_part,ny_part,nz_part,global_part);
-                  offd_data[o_cnt++] = -bfp+ef;
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = bfp + efp;
                }
             }
+            /* stencil: z+ */
             if (iz+1 < nz_part[r+1]) 
             {
-               diag_j[cnt] = row_index+nx_local*ny_local;
-               diag_data[cnt++] = -cfp+ff;
+               diag_j[cnt] = row_index + nx_local*ny_local;
+               diag_data[cnt++] = cfp + ffp;
             }
             else
             {
                if (iz+1 < nz) 
                {
                   offd_j[o_cnt] = hypre_map(ix,iy,iz+1,p,q,r+1,P,Q,R,
-                                      nx_part,ny_part,nz_part,global_part);
-                  offd_data[o_cnt++] = -cfp+ff;
+                                            nx_part,ny_part,nz_part,global_part);
+                  offd_data[o_cnt++] = cfp + ffp;
                }
             }
+            /* done with this row */
             row_index++;
          }
       }
@@ -351,10 +424,6 @@ GenerateVarDifConv( MPI_Comm comm,
             }
    }
 
-   par_rhs = hypre_ParVectorCreate(comm, grid_size, global_part);
-   rhs = hypre_ParVectorLocalVector(par_rhs);
-   hypre_VectorData(rhs) = rhs_data;
-
 #ifdef HYPRE_NO_GLOBAL_PARTITION
 /* ideally we would use less storage earlier in this function, but this is fine
    for testing */
@@ -369,6 +438,11 @@ GenerateVarDifConv( MPI_Comm comm,
    }
 #endif
 
+   par_rhs = hypre_ParVectorCreate(comm, grid_size, global_part);
+   hypre_ParVectorOwnsPartitioning(par_rhs) = 0;
+   rhs = hypre_ParVectorLocalVector(par_rhs);
+   hypre_VectorData(rhs) = rhs_data;
+
    A = hypre_ParCSRMatrixCreate(comm, grid_size, grid_size,
                                 global_part, global_part, num_cols_offd,
                                 diag_i[local_num_rows],
@@ -398,145 +472,88 @@ GenerateVarDifConv( MPI_Comm comm,
    return (HYPRE_ParCSRMatrix) A;
 }
 
-HYPRE_Real afun(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+HYPRE_Real afun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
 {
    HYPRE_Real value;
-   /* value = 1.0 + 1000.0*fabs(xx-yy); */
-   if ((xx < 0.1 && yy < 0.1 && zz < 0.1)
-      || (xx < 0.1 && yy < 0.1 && zz > 0.9)
-      || (xx < 0.1 && yy > 0.9 && zz < 0.1)
-      || (xx > 0.9 && yy < 0.1 && zz < 0.1)
-      || (xx > 0.9 && yy > 0.9 && zz < 0.1)
-      || (xx > 0.9 && yy < 0.1 && zz > 0.9)
-      || (xx < 0.1 && yy > 0.9 && zz > 0.9)
-      || (xx > 0.9 && yy > 0.9 && zz > 0.9))
-      value = 0.01;
-   else if (xx >= 0.1 && xx <= 0.9 
-	 && yy >= 0.1 && yy <= 0.9
-	 && zz >= 0.1 && zz <= 0.9)
-      value = 1000.0;
-   else   
-      value = 1.0 ;
-   /* HYPRE_Real value, pi;
-   pi = 4.0 * atan(1.0);
-   value = cos(pi*xx)*cos(pi*yy); */
+   value = 1.0;
    return value;
 }
 
-HYPRE_Real bfun(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+HYPRE_Real bfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
 {
    HYPRE_Real value;
-   /* value = 1.0 + 1000.0*fabs(xx-yy); */
-   if ((xx < 0.1 && yy < 0.1 && zz < 0.1)
-      || (xx < 0.1 && yy < 0.1 && zz > 0.9)
-      || (xx < 0.1 && yy > 0.9 && zz < 0.1)
-      || (xx > 0.9 && yy < 0.1 && zz < 0.1)
-      || (xx > 0.9 && yy > 0.9 && zz < 0.1)
-      || (xx > 0.9 && yy < 0.1 && zz > 0.9)
-      || (xx < 0.1 && yy > 0.9 && zz > 0.9)
-      || (xx > 0.9 && yy > 0.9 && zz > 0.9))
-      value = 0.01;
-   else if (xx >= 0.1 && xx <= 0.9 
-	 && yy >= 0.1 && yy <= 0.9
-	 && zz >= 0.1 && zz <= 0.9)
-      value = 1000.0;
-   else   
-      value = 1.0 ;
-   /* HYPRE_Real value, pi;
-   pi = 4.0 * atan(1.0);
-   value = 1.0 - 2.0*xx; 
-   value = cos(pi*xx)*cos(pi*yy); */
-   /* HYPRE_Real value;
-   value = 1.0 + 1000.0 * fabs(xx-yy); 
-   HYPRE_Real value, x0, y0;
-   x0 = fabs(xx - 0.5);
-   y0 = fabs(yy - 0.5);
-   if (y0 > x0) x0 = y0;
-   if (x0 >= 0.125 && x0 <= 0.25)
-      value = 1.0;
-   else
-      value = 1000.0;*/
+   value = 1.0;
    return value;
 }
 
-HYPRE_Real cfun(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+HYPRE_Real cfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
 {
    HYPRE_Real value;
-   if ((xx < 0.1 && yy < 0.1 && zz < 0.1)
-      || (xx < 0.1 && yy < 0.1 && zz > 0.9)
-      || (xx < 0.1 && yy > 0.9 && zz < 0.1)
-      || (xx > 0.9 && yy < 0.1 && zz < 0.1)
-      || (xx > 0.9 && yy > 0.9 && zz < 0.1)
-      || (xx > 0.9 && yy < 0.1 && zz > 0.9)
-      || (xx < 0.1 && yy > 0.9 && zz > 0.9)
-      || (xx > 0.9 && yy > 0.9 && zz > 0.9))
-      value = 0.01;
-   else if (xx >= 0.1 && xx <= 0.9 
-	 && yy >= 0.1 && yy <= 0.9
-	 && zz >= 0.1 && zz <= 0.9)
-      value = 1000.0;
-   else   
-      value = 1.0 ;
-   /*if (xx <= 0.75 && yy <= 0.75 && zz <= 0.75)
-      value = 0.1;
-   else if (xx > 0.75 && yy > 0.75 && zz > 0.75)
-      value = 100000;
-   else   
-      value = 1.0 ;*/
+   value = 1.0;
    return value;
 }
 
-HYPRE_Real dfun(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+HYPRE_Real dfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
 {
    HYPRE_Real value;
-   /*HYPRE_Real pi;
-   pi = 4.0 * atan(1.0);
-   value = -sin(pi*xx)*cos(pi*yy);*/
-   value = 0;
+   if (rs_example == 1)
+   {
+     value = sin(rs_l*M_PI/8.0);
+   }
+   else if (rs_example == 2)
+   {
+     value = (2.0*yy-1.0)*(1.0-xx*xx);
+   }
+   else
+   {
+     value = 4.0*xx*(xx-1.0)*(1.0-2.0*yy);
+   }
    return value;
 }
 
-HYPRE_Real efun(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+HYPRE_Real efun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
 {
    HYPRE_Real value;
-   /*HYPRE_Real pi;
-   pi = 4.0 * atan(1.0);
-   value = sin(pi*yy)*cos(pi*xx);*/
-   value = 0;
+   if (rs_example == 1)
+   {
+     value = cos(rs_l*M_PI/8.0);
+   }
+   else if (rs_example == 2)
+   {
+     value = 2.0*xx*yy*(yy-1.0);
+   }
+   else
+   {
+     value = -4.0*yy*(yy-1.0)*(1.0-2.0*xx);
+   }
    return value;
 }
 
-HYPRE_Real ffun(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+HYPRE_Real ffun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
 {
    HYPRE_Real value;
-   value = 0.0;
+   value = efun_rs(xx, yy, zz);
    return value;
 }
 
-HYPRE_Real gfun(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+HYPRE_Real gfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
 {
    HYPRE_Real value;
    value = 0.0;
    return value;
 }
 
-HYPRE_Real rfun(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+HYPRE_Real rfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
 {
-   /* HYPRE_Real value, pi;
-   pi = 4.0 * atan(1.0);
-   value = -4.0*pi*pi*sin(pi*xx)*sin(pi*yy)*cos(pi*xx)*cos(pi*yy); */
    HYPRE_Real value;
-   /* value = xx*(1.0-xx)*yy*(1.0-yy); */
    value = 1.0;
    return value;
 }
 
-HYPRE_Real bndfun(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
+HYPRE_Real bndfun_rs(HYPRE_Real xx, HYPRE_Real yy, HYPRE_Real zz)
 {
    HYPRE_Real value;
-   /*HYPRE_Real pi;
-   pi = 4.0 * atan(1.0);
-   value = sin(pi*xx)+sin(13*pi*xx)+sin(pi*yy)+sin(13*pi*yy);*/
    value = 0.0;
    return value;
 }
+
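The hunk above folds the Dirichlet boundary data into the right-hand side:
whenever a grid point sits next to the boundary, the known boundary value is
multiplied by the stencil coefficient of the eliminated neighbour (e.g.
afm+dfm for the x- face) and subtracted from rhs_data. A minimal 1D sketch of
the same elimination, assuming a 3-point stencil with off-diagonal
coefficient c_off and boundary value g (illustrative, not hypre code):

    /* Row of the first interior unknown u_1 of -u'' = f with u(0) = g:
     *   c_diag*u_1 + c_off*u_2 = f_1 - c_off*g,
     * i.e. the known neighbour moves to the RHS, exactly as
     * rhs_data[row_index] -= (afm+dfm) * bndfun_rs(...) does per face. */
    static double eliminate_dirichlet(double rhs, double c_off, double g)
    {
        return rhs - c_off * g;
    }
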
diff --git a/src/parcsr_mv/_hypre_parcsr_mv.h b/src/parcsr_mv/_hypre_parcsr_mv.h
index 63d22a8..4831229 100644
--- a/src/parcsr_mv/_hypre_parcsr_mv.h
+++ b/src/parcsr_mv/_hypre_parcsr_mv.h
@@ -831,6 +831,9 @@ HYPRE_Int hypre_ParCSRMatrixCopy ( hypre_ParCSRMatrix *A , hypre_ParCSRMatrix *B
 HYPRE_Int hypre_FillResponseParToCSRMatrix ( void *p_recv_contact_buf , HYPRE_Int contact_size , HYPRE_Int contact_proc , void *ro , MPI_Comm comm , void **p_send_response_buf , HYPRE_Int *response_message_size );
 hypre_ParCSRMatrix *hypre_ParCSRMatrixCompleteClone ( hypre_ParCSRMatrix *A );
 hypre_ParCSRMatrix *hypre_ParCSRMatrixUnion ( hypre_ParCSRMatrix *A , hypre_ParCSRMatrix *B );
+#ifdef HYPRE_USE_GPU
+hypre_int hypre_ParCSRMatrixIsManaged(hypre_ParCSRMatrix *a);
+#endif
 
 /* parcsr_matrix.c */
 
@@ -869,6 +872,9 @@ HYPRE_Int hypre_ParVectorPrintIJ ( hypre_ParVector *vector , HYPRE_Int base_j ,
 HYPRE_Int hypre_ParVectorReadIJ ( MPI_Comm comm , const char *filename , HYPRE_Int *base_j_ptr , hypre_ParVector **vector_ptr );
 HYPRE_Int hypre_FillResponseParToVectorAll ( void *p_recv_contact_buf , HYPRE_Int contact_size , HYPRE_Int contact_proc , void *ro , MPI_Comm comm , void **p_send_response_buf , HYPRE_Int *response_message_size );
 HYPRE_Complex hypre_ParVectorLocalSumElts ( hypre_ParVector *vector );
+#ifdef HYPRE_USE_GPU
+hypre_int hypre_ParVectorIsManaged(hypre_ParVector *vector);
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/src/parcsr_mv/par_csr_communication.c b/src/parcsr_mv/par_csr_communication.c
index 6260f21..e7d9dd2 100644
--- a/src/parcsr_mv/par_csr_communication.c
+++ b/src/parcsr_mv/par_csr_communication.c
@@ -71,7 +71,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job,
       case HYPRE_COMM_PKG_JOB_COMPLEX:
          if (!send_data)
          {
-            send_data = hypre_TAlloc(HYPRE_Complex, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends));  
+            send_data = hypre_PinnedTAlloc(HYPRE_Complex, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends));
          }
          if (!recv_data)
          {
diff --git a/src/parcsr_mv/par_csr_matrix.c b/src/parcsr_mv/par_csr_matrix.c
index 33ba6a6..3885187 100644
--- a/src/parcsr_mv/par_csr_matrix.c
+++ b/src/parcsr_mv/par_csr_matrix.c
@@ -2153,3 +2153,11 @@ hypre_ParCSRMatrix * hypre_ParCSRMatrixUnion( hypre_ParCSRMatrix * A,
 
    return C;
 }
+#ifdef HYPRE_USE_GPU
+hypre_int hypre_ParCSRMatrixIsManaged(hypre_ParCSRMatrix *a){
+  if (hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(a)))
+    return ((hypre_CSRMatrixIsManaged(hypre_ParCSRMatrixDiag(a))) && (hypre_CSRMatrixIsManaged(hypre_ParCSRMatrixOffd(a))));
+  else
+    return hypre_CSRMatrixIsManaged(hypre_ParCSRMatrixDiag(a)); 
+}
+#endif
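hypre_ParCSRMatrixIsManaged above reduces to pointerIsManaged() checks on the
diag (and, when present, offd) CSR blocks. A standalone sketch of such a
check against the CUDA 8/9 runtime, assuming pointerIsManaged behaves roughly
like this (the actual helper lives in hypre's utilities and may differ):

    #include <cuda_runtime_api.h>

    /* Hypothetical stand-in for pointerIsManaged(); NULL and plain host
     * pointers report "not managed". */
    static int pointer_is_managed_sketch(const void *ptr)
    {
        struct cudaPointerAttributes attr;
        if (ptr == NULL) return 0;
        if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess)
        {
            cudaGetLastError();  /* clear the error set for unregistered host memory */
            return 0;
        }
        return attr.isManaged;   /* CUDA 8/9 field; newer toolkits expose attr.type */
    }
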
diff --git a/src/parcsr_mv/par_csr_matvec.c b/src/parcsr_mv/par_csr_matvec.c
index 7cb8b80..cf8cb97 100644
--- a/src/parcsr_mv/par_csr_matvec.c
+++ b/src/parcsr_mv/par_csr_matvec.c
@@ -18,6 +18,12 @@
 
 #include "_hypre_parcsr_mv.h"
 #include <assert.h>
+//#ifdef HYPRE_USE_GPU
+//extern "C"
+//{
+//void PackOnDevice(HYPRE_Complex *send_data,HYPRE_Complex *x_local_data, HYPRE_Int *send_map, HYPRE_Int begin,HYPRE_Int end,cudaStream_t s);
+//}
+//#endif
 
 /*--------------------------------------------------------------------------
  * hypre_ParCSRMatrixMatvec
@@ -66,7 +72,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
     *  these conditions terminates processing, and the ierr flag
     *  is informational only.
     *--------------------------------------------------------------------*/
- 
+   PUSH_RANGE_PAYLOAD("PAR_CSR_MATVEC",5,x_size);
    hypre_assert( idxstride>0 );
 
    if (num_cols != x_size)
@@ -102,7 +108,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
 #endif
-
+   PUSH_RANGE("MPI_PACK",3);
    HYPRE_Int use_persistent_comm = 0;
 #ifdef HYPRE_USING_PERSISTENT_COMM
    use_persistent_comm = num_vectors == 1;
@@ -115,6 +121,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
    if ( use_persistent_comm )
    {
 #ifdef HYPRE_USING_PERSISTENT_COMM
+     PUSH_RANGE("PERCOMM1",0);
       persistent_comm_handle = hypre_ParCSRCommPkgGetPersistentCommHandle(1, comm_pkg);
 
       HYPRE_Int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
@@ -122,6 +129,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
 
       hypre_VectorData(x_tmp) = (HYPRE_Complex *)persistent_comm_handle->recv_data;
       hypre_SeqVectorSetDataOwner(x_tmp, 0);
+      POP_RANGE;
 #endif
    }
    else
@@ -144,6 +152,20 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
    {
       HYPRE_Int begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0);
       HYPRE_Int end   = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
+#ifdef HYPRE_USE_GPU
+      PUSH_RANGE("PERCOMM2DEVICE",4);
+#ifdef HYPRE_USING_PERSISTENT_COMM
+      PackOnDevice((HYPRE_Complex*)persistent_comm_handle->send_data,x_local_data,hypre_ParCSRCommPkgSendMapElmts(comm_pkg),begin,end,HYPRE_STREAM(4));
+      //PrintPointerAttributes(persistent_comm_handle->send_data);
+#else
+      PackOnDevice((HYPRE_Complex*)x_buf_data[0],x_local_data,hypre_ParCSRCommPkgSendMapElmts(comm_pkg),begin,end,HYPRE_STREAM(4));
+#endif
+      POP_RANGE;
+      SetAsyncMode(1);
+      hypre_CSRMatrixMatvecOutOfPlace( alpha, diag, x_local, beta, b_local, y_local, 0);
+      SetAsyncMode(0);
+      //gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(7)));
+#else
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for HYPRE_SMP_SCHEDULE
 #endif
@@ -156,6 +178,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
 #endif
             = x_local_data[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)];
       }
+#endif
    }
    else
       for ( jv=0; jv<num_vectors; ++jv )
@@ -185,7 +208,8 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
    hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
    hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
 #endif
-
+   POP_RANGE;
+   PUSH_RANGE("MPI_HALO_EXC_SEND",4);
    if (use_persistent_comm)
    {
 #ifdef HYPRE_USING_PERSISTENT_COMM
@@ -204,13 +228,14 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
 #endif
-
+   POP_RANGE;
+#ifndef HYPRE_USE_GPU
    hypre_CSRMatrixMatvecOutOfPlace( alpha, diag, x_local, beta, b_local, y_local, 0);
-
+#endif
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] -= hypre_MPI_Wtime();
 #endif
-   
+   PUSH_RANGE("MPI_HALO_EXC_RECV",6);
    if (use_persistent_comm)
    {
 #ifdef HYPRE_USING_PERSISTENT_COMM
@@ -226,7 +251,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
       }
       hypre_TFree(comm_handle);
    }
-
+   POP_RANGE;
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_HALO_EXCHANGE] += hypre_MPI_Wtime();
 #endif
@@ -236,7 +261,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] -= hypre_MPI_Wtime();
 #endif
-
+   PUSH_RANGE("MPI_UNPACK",5);
    hypre_SeqVectorDestroy(x_tmp);
    x_tmp = NULL;
    if (!use_persistent_comm)
@@ -248,7 +273,11 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_PACK_UNPACK] += hypre_MPI_Wtime();
 #endif
-
+   POP_RANGE;
+#ifdef HYPRE_USE_GPU
+   gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+#endif
+   POP_RANGE; // PAR_CSR
    return ierr;
 }
 
diff --git a/src/parcsr_mv/par_vector.c b/src/parcsr_mv/par_vector.c
index d7e8864..01ba038 100644
--- a/src/parcsr_mv/par_vector.c
+++ b/src/parcsr_mv/par_vector.c
@@ -1076,3 +1076,9 @@ HYPRE_Complex hypre_ParVectorLocalSumElts( hypre_ParVector * vector )
 {
    return hypre_VectorSumElts( hypre_ParVectorLocalVector(vector) );
 }
+#ifdef HYPRE_USE_GPU
+hypre_int hypre_ParVectorIsManaged(hypre_ParVector *vector){
+  if (vector==NULL) return 1;
+  return hypre_SeqVectorIsManaged(hypre_ParVectorLocalVector(vector));
+}
+#endif
diff --git a/src/seq_mv/Makefile b/src/seq_mv/Makefile
index abd5795..6982fa0 100644
--- a/src/seq_mv/Makefile
+++ b/src/seq_mv/Makefile
@@ -13,6 +13,9 @@
 
 include ../config/Makefile.config
 
+FILES_NVCC =
+include $(HYPRE_NVCC_MAKEFILE)
+
 CINCLUDES = ${INCLUDES} ${MPIINCLUDE}
 
 C_COMPILE_FLAGS =\
@@ -44,6 +47,7 @@ FILES =\
  vector.c
 
 OBJS = ${FILES:.c=.o}
+CUOBJS = ${FILES_NVCC:.cu=.o}
 
 SONAME = libHYPRE_seq_mv-${HYPRE_RELEASE_VERSION}.so
 
@@ -71,12 +75,12 @@ distclean: clean
 # Rules
 ##################################################################
 
-libHYPRE_seq_mv.a: ${OBJS}
+libHYPRE_seq_mv.a: ${OBJS} ${CUOBJS}
 	@echo  "Building $@ ... "
 	${AR} $@ ${OBJS}
 	${RANLIB} $@
 
-libHYPRE_seq_mv.so: ${OBJS}
+libHYPRE_seq_mv.so: ${OBJS} ${CUOBJS}
 	@echo  "Building $@ ... "
 	${BUILD_CC_SHARED} -o ${SONAME} ${OBJS} ${SHARED_SET_SONAME}${SONAME}
 	ln -s ${SONAME} $@
diff --git a/src/seq_mv/Makefile.empty b/src/seq_mv/Makefile.empty
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/seq_mv/Makefile.empty
@@ -0,0 +1 @@
+
diff --git a/src/seq_mv/Makefile.nvcc b/src/seq_mv/Makefile.nvcc
new file mode 100644
index 0000000..bb42694
--- /dev/null
+++ b/src/seq_mv/Makefile.nvcc
@@ -0,0 +1,4 @@
+
+FILES_NVCC =\
+ gpukernels.cu
+
diff --git a/src/seq_mv/csr_matrix.c b/src/seq_mv/csr_matrix.c
index 3a97f12..aa880c2 100644
--- a/src/seq_mv/csr_matrix.c
+++ b/src/seq_mv/csr_matrix.c
@@ -17,7 +17,9 @@
  *****************************************************************************/
 
 #include "seq_mv.h"
-
+#ifdef HYPRE_USE_GPU
+#include "gpukernels.h"
+#endif
 #ifdef HYPRE_PROFILE
 HYPRE_Real hypre_profile_times[HYPRE_TIMER_ID_COUNT] = { 0 };
 #endif
@@ -33,7 +35,7 @@ hypre_CSRMatrixCreate( HYPRE_Int num_rows,
 {
    hypre_CSRMatrix  *matrix;
 
-   matrix = hypre_CTAlloc(hypre_CSRMatrix, 1);
+   matrix = hypre_HostCTAlloc(hypre_CSRMatrix, 1);
 
    hypre_CSRMatrixData(matrix) = NULL;
    hypre_CSRMatrixI(matrix)    = NULL;
@@ -47,7 +49,9 @@ hypre_CSRMatrixCreate( HYPRE_Int num_rows,
    hypre_CSRMatrixOwnsData(matrix) = 1;
    hypre_CSRMatrixNumRownnz(matrix) = num_rows;
 
-
+#ifdef HYPRE_USE_GPU
+   matrix->on_device=0;
+#endif
    return matrix;
 }
 /*--------------------------------------------------------------------------
@@ -72,7 +76,7 @@ hypre_CSRMatrixDestroy( hypre_CSRMatrix *matrix )
          hypre_CSRMatrixData(matrix) = NULL;
          hypre_CSRMatrixJ(matrix)    = NULL;
       }
-      hypre_TFree(matrix);
+      hypre_HostTFree(matrix);
       matrix = NULL;
    }
 
@@ -670,3 +674,36 @@ HYPRE_Int hypre_CSRMatrixGetLoadBalancedPartitionEnd(hypre_CSRMatrix *A)
 {
    return hypre_CSRMatrixGetLoadBalancedPartitionBoundary(A, hypre_GetThreadNum() + 1);
 }
+#ifdef HYPRE_USE_GPU
+void hypre_CSRMatrixPrefetchToDevice(hypre_CSRMatrix *A){
+  if (hypre_CSRMatrixNumNonzeros(A)==0) return;
+
+  PUSH_RANGE_PAYLOAD("hypre_CSRMatrixPrefetchToDevice",0,hypre_CSRMatrixNumNonzeros(A));
+  if ((!A->on_device)&&(hypre_CSRMatrixNumNonzeros(A)>8192)){
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixData(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Complex),HYPRE_DEVICE,HYPRE_STREAM(4)));
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixI(A),(hypre_CSRMatrixNumRows(A)+1)*sizeof(HYPRE_Int),HYPRE_DEVICE,HYPRE_STREAM(5)));
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixJ(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Int),HYPRE_DEVICE,HYPRE_STREAM(6)));
+    gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+    gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(5)));
+    gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(6)));
+    A->on_device=1;
+  }
+  POP_RANGE;
+}
+void hypre_CSRMatrixPrefetchToHost(hypre_CSRMatrix *A){
+  PUSH_RANGE("hypre_CSRMatrixPrefetchToDevice",0);
+  if (A->on_device){
+    A->on_device=0;
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixData(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Complex),cudaCpuDeviceId,HYPRE_STREAM(4)));
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixI(A),(hypre_CSRMatrixNumRows(A)+1)*sizeof(HYPRE_Int),cudaCpuDeviceId,HYPRE_STREAM(4)));
+    gpuErrchk(cudaMemPrefetchAsync(hypre_CSRMatrixJ(A),hypre_CSRMatrixNumNonzeros(A)*sizeof(HYPRE_Int),cudaCpuDeviceId,HYPRE_STREAM(4)));
+    gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+  }
+  POP_RANGE;
+}
+hypre_int hypre_CSRMatrixIsManaged(hypre_CSRMatrix *a){
+  return ((pointerIsManaged((void*)hypre_CSRMatrixData(a))) 
+	  && (pointerIsManaged((void*)hypre_CSRMatrixI(a)))
+	  && (pointerIsManaged((void*)hypre_CSRMatrixJ(a))));
+}
+#endif
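The prefetch helpers above migrate the three CSR arrays with
cudaMemPrefetchAsync on separate streams and synchronize before flipping the
on_device flag. A self-contained sketch of the same unified-memory pattern on
a single array (device id, size, and names are illustrative):

    #include <cuda_runtime_api.h>

    /* Prefetch a managed array to the GPU, then back to the host. */
    int prefetch_roundtrip_sketch(void)
    {
        const size_t n = 1 << 20;
        double *data = NULL;
        int dev = 0;
        cudaStream_t s;

        if (cudaMallocManaged(&data, n * sizeof(double)) != cudaSuccess) return 1;
        cudaStreamCreate(&s);

        /* migrate pages ahead of the kernels that will touch them */
        cudaMemPrefetchAsync(data, n * sizeof(double), dev, s);
        cudaStreamSynchronize(s);

        /* ... kernels on stream s ... */

        cudaMemPrefetchAsync(data, n * sizeof(double), cudaCpuDeviceId, s);
        cudaStreamSynchronize(s);

        cudaStreamDestroy(s);
        cudaFree(data);
        return 0;
    }
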
diff --git a/src/seq_mv/csr_matvec.c b/src/seq_mv/csr_matvec.c
index 25c231c..548343a 100644
--- a/src/seq_mv/csr_matvec.c
+++ b/src/seq_mv/csr_matvec.c
@@ -19,6 +19,7 @@
 #include "seq_mv.h"
 #include <assert.h>
 
+
 /*--------------------------------------------------------------------------
  * hypre_CSRMatrixMatvec
  *--------------------------------------------------------------------------*/
@@ -36,7 +37,15 @@ hypre_CSRMatrixMatvecOutOfPlace( HYPRE_Complex    alpha,
 #ifdef HYPRE_PROFILE
    HYPRE_Real time_begin = hypre_MPI_Wtime();
 #endif
-
+#ifdef HYPRE_USE_GPU
+   PUSH_RANGE_PAYLOAD("MATVEC",0, hypre_CSRMatrixNumRows(A));
+   HYPRE_Int ret=hypre_CSRMatrixMatvecDevice( alpha,A,x,beta,b,y,offset);
+   POP_RANGE;
+#ifdef HYPRE_PROFILE
+   hypre_profile_times[HYPRE_TIMER_ID_MATVEC] += hypre_MPI_Wtime() - time_begin;
+#endif
+   return ret;
+#endif
    HYPRE_Complex    *A_data   = hypre_CSRMatrixData(A);
    HYPRE_Int        *A_i      = hypre_CSRMatrixI(A) + offset;
    HYPRE_Int        *A_j      = hypre_CSRMatrixJ(A);
@@ -765,3 +774,77 @@ hypre_CSRMatrixMatvec_FF( HYPRE_Complex    alpha,
 
    return ierr;
 }
+#ifdef HYPRE_USE_GPU
+HYPRE_Int
+hypre_CSRMatrixMatvecDevice( HYPRE_Complex    alpha,
+                             hypre_CSRMatrix *A,
+                             hypre_Vector    *x,
+                             HYPRE_Complex    beta,
+                             hypre_Vector    *b,
+                             hypre_Vector    *y,
+                             HYPRE_Int        offset )
+{
+
+  static cusparseHandle_t handle;
+  static cusparseMatDescr_t descr;
+  static HYPRE_Int FirstCall=1;
+  cusparseStatus_t status;
+  static cudaStream_t s[10];
+  static HYPRE_Int myid;
+
+  if (b!=y){
+
+    PUSH_RANGE_PAYLOAD("MEMCPY",1,y->size-offset);
+    VecCopy(y->data,b->data,(y->size-offset),HYPRE_STREAM(4));
+    POP_RANGE;
+  }
+
+  if (x==y) fprintf(stderr,"ERROR::x and y are the same pointer in hypre_CSRMatrixMatvecDevice\n");
+
+  if (FirstCall){
+    PUSH_RANGE("FIRST_CALL",4);
+
+    handle=getCusparseHandle();
+    
+    status= cusparseCreateMatDescr(&descr); 
+    if (status != CUSPARSE_STATUS_SUCCESS) {
+      printf("ERROR:: Matrix descriptor initialization failed\n");
+      exit(2);
+    } 
+    
+    cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
+    cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
+    
+    FirstCall=0;
+    hypre_int jj;
+    for(jj=0;jj<5;jj++)
+      s[jj]=HYPRE_STREAM(jj);
+    nvtxNameCudaStreamA(s[4], "HYPRE_COMPUTE_STREAM");
+    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
+    myid++;
+    POP_RANGE;
+  }
+
+  PUSH_RANGE("PREFETCH+SPMV",2);
+
+  hypre_CSRMatrixPrefetchToDevice(A);
+  hypre_SeqVectorPrefetchToDevice(x);
+  hypre_SeqVectorPrefetchToDevice(y);
+  
+  if (offset!=0) printf("WARNING:: Offset is not zero in hypre_CSRMatrixMatvecDevice :: %d \n",offset);
+  cusparseErrchk(cusparseDcsrmv(handle ,
+				CUSPARSE_OPERATION_NON_TRANSPOSE, 
+				A->num_rows-offset, A->num_cols, A->num_nonzeros,
+				&alpha, descr,
+				A->data ,A->i+offset,A->j,
+				x->data, &beta, y->data+offset));
+  
+  if (!GetAsyncMode()){
+    gpuErrchk(cudaStreamSynchronize(s[4]));
+  }
+  POP_RANGE;
+  
+  return 0;
+  
+}
+#endif
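Note the ordering inside hypre_CSRMatrixMatvecDevice above: cusparseDcsrmv
computes y := alpha*op(A)*x + beta*y in place, so the out-of-place variant
(b != y) must first seed y with b via VecCopy before the beta scaling acts. A
host-side reference of the semantics the device path computes (a sketch, not
part of the patch):

    /* y = alpha*A*x + beta*b for CSR arrays (A_i, A_j, A_data). */
    static void csrmv_ref(int n, double alpha, const double *A_data,
                          const int *A_i, const int *A_j,
                          const double *x, double beta, const double *b,
                          double *y)
    {
        for (int row = 0; row < n; row++)
        {
            double t = 0.0;
            for (int jj = A_i[row]; jj < A_i[row + 1]; jj++)
                t += A_data[jj] * x[A_j[jj]];
            y[row] = alpha * t + beta * b[row];   /* b seeds y, as VecCopy does */
        }
    }
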
diff --git a/src/seq_mv/gpukernels.cu b/src/seq_mv/gpukernels.cu
new file mode 100644
index 0000000..33b7541
--- /dev/null
+++ b/src/seq_mv/gpukernels.cu
@@ -0,0 +1,243 @@
+#include <stdio.h>
+#include <cuda_runtime.h>
+//#include <cublas_v2.h>
+#include "_hypre_utilities.h"
+#define gpuErrchk2(ans) { gpuAssert2((ans), __FILE__, __LINE__); }
+inline void gpuAssert2(cudaError_t code, const char *file, hypre_int line)
+{
+   if (code != cudaSuccess) 
+   {
+     printf("GPUassert2: %s %s %d\n", cudaGetErrorString(code), file, line);
+     exit(2);
+   }
+}
+
+
+
+extern "C"{
+  __global__
+  void VecScaleKernelText(HYPRE_Complex *__restrict__ u, const HYPRE_Complex *__restrict__ v, const HYPRE_Complex *__restrict__ l1_norm, hypre_int num_rows){
+    hypre_int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i<num_rows){
+      u[i]+=__ldg(v+i)/__ldg(l1_norm+i);
+    }
+  }
+}
+
+extern "C"{
+  __global__
+  void VecScaleKernel(HYPRE_Complex *__restrict__ u, const HYPRE_Complex *__restrict__ v, const HYPRE_Complex * __restrict__ l1_norm, hypre_int num_rows){
+    hypre_int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i<num_rows){
+      u[i]+=v[i]/l1_norm[i];
+    }
+  }
+}
+
+extern "C"{
+  void VecScale(HYPRE_Complex *u, HYPRE_Complex *v, HYPRE_Complex *l1_norm, hypre_int num_rows,cudaStream_t s){
+    PUSH_RANGE_PAYLOAD("VECSCALE",1,num_rows);
+    const hypre_int tpb=64;
+    hypre_int num_blocks=num_rows/tpb+1;
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    MemPrefetchSized(l1_norm,num_rows*sizeof(HYPRE_Complex),HYPRE_DEVICE,s);
+    VecScaleKernel<<<num_blocks,tpb,0,s>>>(u,v,l1_norm,num_rows);
+#ifdef CATCH_LAUNCH_ERRORS    
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    gpuErrchk2(cudaStreamSynchronize(s));
+    POP_RANGE;
+  }
+}
+
+
+extern "C"{
+
+  __global__
+  void VecCopyKernel(HYPRE_Complex* __restrict__ tgt, const HYPRE_Complex* __restrict__ src, hypre_int size){
+    hypre_int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i<size) tgt[i]=src[i];
+  }
+  void VecCopy(HYPRE_Complex* tgt, const HYPRE_Complex* src, hypre_int size,cudaStream_t s){
+    hypre_int tpb=64;
+    hypre_int num_blocks=size/tpb+1;
+    PUSH_RANGE_PAYLOAD("VecCopy",5,size);
+    //MemPrefetch(tgt,0,s);
+    //MemPrefetch(src,0,s);
+    VecCopyKernel<<<num_blocks,tpb,0,s>>>(tgt,src,size);
+    //gpuErrchk2(cudaStreamSynchronize(s));
+    POP_RANGE;
+  }
+}
+extern "C"{
+
+  __global__
+  void VecSetKernel(HYPRE_Complex* __restrict__ tgt, const HYPRE_Complex value,hypre_int size){
+    hypre_int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i<size) tgt[i]=value;
+  }
+  void VecSet(HYPRE_Complex* tgt, hypre_int size, HYPRE_Complex value, cudaStream_t s){
+    hypre_int tpb=64;
+    //cudaDeviceSynchronize();
+    MemPrefetchSized(tgt,size*sizeof(HYPRE_Complex),HYPRE_DEVICE,s);
+    hypre_int num_blocks=size/tpb+1;
+    VecSetKernel<<<num_blocks,tpb,0,s>>>(tgt,value,size);
+    cudaStreamSynchronize(s);
+    //cudaDeviceSynchronize();
+  }
+}
+extern "C"{
+  __global__
+  void  PackOnDeviceKernel(HYPRE_Complex* __restrict__ send_data,const HYPRE_Complex* __restrict__ x_local_data, const hypre_int* __restrict__ send_map, hypre_int begin,hypre_int end){
+    hypre_int i = begin+blockIdx.x * blockDim.x + threadIdx.x;
+    if (i<end){
+      send_data[i-begin]=x_local_data[send_map[i]];
+    }
+  }
+  void PackOnDevice(HYPRE_Complex *send_data,HYPRE_Complex *x_local_data, hypre_int *send_map, hypre_int begin,hypre_int end,cudaStream_t s){
+    if ((end-begin)<=0) return;
+    hypre_int tpb=64;
+    hypre_int num_blocks=(end-begin)/tpb+1;
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    PackOnDeviceKernel<<<num_blocks,tpb,0,s>>>(send_data,x_local_data,send_map,begin,end);
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    PUSH_RANGE("PACK_PREFETCH",1);
+#ifndef HYPRE_GPU_USE_PINNED
+    MemPrefetchSized((void*)send_data,(end-begin)*sizeof(HYPRE_Complex),cudaCpuDeviceId,s);
+#endif
+    POP_RANGE;
+    //gpuErrchk2(cudaStreamSynchronize(s));
+  }
+}
+  
+  // Scale vector by scalar
+
+extern "C"{
+__global__
+void VecScaleScalarKernel(HYPRE_Complex *__restrict__ u, const HYPRE_Complex alpha ,hypre_int num_rows){
+  hypre_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  //if (i<5) printf("DEVICE %d %lf %lf %lf\n",i,u[i],v[i],l1_norm[i]);
+  if (i<num_rows){
+    u[i]*=alpha;
+    //if (i==0) printf("Diff Device %d %lf %lf %lf\n",i,u[i],v[i],l1_norm[i]);
+  }
+}
+}
+extern "C"{
+  hypre_int VecScaleScalar(HYPRE_Complex *u, const HYPRE_Complex alpha,  hypre_int num_rows,cudaStream_t s){
+    PUSH_RANGE("SEQVECSCALE",4);
+    hypre_int num_blocks=num_rows/64+1;
+    
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    VecScaleScalarKernel<<<num_blocks,64,0,s>>>(u,alpha,num_rows);
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+    gpuErrchk2(cudaStreamSynchronize(s));
+    POP_RANGE;
+    return 0;
+  }
+}
+
+
+extern "C"{
+__global__
+void SpMVCudaKernel(HYPRE_Complex* __restrict__ y,HYPRE_Complex alpha, const HYPRE_Complex* __restrict__ A_data, const hypre_int* __restrict__ A_i, const hypre_int* __restrict__ A_j, const HYPRE_Complex* __restrict__ x, HYPRE_Complex beta, hypre_int num_rows)
+{
+  hypre_int i= blockIdx.x * blockDim.x + threadIdx.x;
+  if (i<num_rows){
+    HYPRE_Complex temp = 0.0;
+    hypre_int jj;
+    for (jj = A_i[i]; jj < A_i[i+1]; jj++){
+      hypre_int ajj=A_j[jj];
+      temp += A_data[jj] * x[ajj];
+    }
+    y[i] =y[i]*beta+alpha*temp;
+  }
+}
+
+__global__
+void SpMVCudaKernelZB(HYPRE_Complex* __restrict__ y,HYPRE_Complex alpha, const HYPRE_Complex* __restrict__ A_data, const hypre_int* __restrict__ A_i, const hypre_int* __restrict__ A_j, const HYPRE_Complex* __restrict__ x, hypre_int num_rows)
+{
+  hypre_int i= blockIdx.x * blockDim.x + threadIdx.x;
+  if (i<num_rows){
+    HYPRE_Complex temp = 0.0;
+    hypre_int jj;
+    for (jj = A_i[i]; jj < A_i[i+1]; jj++){
+      hypre_int ajj=A_j[jj];
+      temp += A_data[jj] * x[ajj];
+    }
+    y[i] = alpha*temp;
+  }
+}
+  void SpMVCuda(hypre_int num_rows,HYPRE_Complex alpha, HYPRE_Complex *A_data,hypre_int *A_i, hypre_int *A_j, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y){
+    hypre_int num_threads=64;
+    hypre_int num_blocks=num_rows/num_threads+1;
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif    
+    if (beta==0.0)
+      SpMVCudaKernelZB<<<num_blocks,num_threads>>>(y,alpha,A_data,A_i,A_j,x,num_rows);
+    else
+      SpMVCudaKernel<<<num_blocks,num_threads>>>(y,alpha,A_data,A_i,A_j,x,beta,num_rows);
+#ifdef CATCH_LAUNCH_ERRORS
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+#endif
+
+  }
+}
+extern "C"{
+  __global__
+  void CompileFlagSafetyCheck(hypre_int actual){
+#ifdef __CUDA_ARCH__
+    hypre_int cudarch=__CUDA_ARCH__;
+    if (cudarch!=actual){
+      printf("WARNING :: nvcc -arch flag does not match actual device architecture\nWARNING :: The code can fail silently and produce wrong results\n");
+      printf("Arch specified at compile = sm_%d Actual device = sm_%d\n",cudarch/10,actual/10);
+    } 
+#else
+    printf("ERROR:: CUDA_ ARCH is not defined \n This should not be happening\n");
+#endif
+  }
+}
+extern "C"{
+  void CudaCompileFlagCheck(){
+    hypre_int devCount;
+    cudaGetDeviceCount(&devCount);
+    hypre_int i;
+    hypre_int cudarch_actual;
+    for(i = 0; i < devCount; ++i)
+      {
+        struct cudaDeviceProp props;
+        cudaGetDeviceProperties(&props, i);
+        cudarch_actual=props.major*100+props.minor*10;
+      }
+    gpuErrchk2(cudaPeekAtLastError());
+    gpuErrchk2(cudaDeviceSynchronize());
+    CompileFlagSafetyCheck<<<1,1,0,0>>>(cudarch_actual);
+    cudaError_t code=cudaPeekAtLastError();
+    if (code != cudaSuccess)
+      {
+        fprintf(stderr,"ERROR in CudaCompileFlagCheck: %s\n", cudaGetErrorString(code));
+        fprintf(stderr,"ERROR :: Check if compile arch flags match actual device arch = sm_%d\n",cudarch_actual/10);
+        exit(2);
+      }
+    gpuErrchk2(cudaDeviceSynchronize());
+  }
+}
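The launch configurations above size their grids as num_rows/tpb + 1; the
in-kernel bounds check (i < num_rows) keeps the spare block harmless, at the
cost of one extra block whenever num_rows is an exact multiple of tpb. The
exact ceiling division is the usual alternative (illustrative, not part of
the patch):

    /* blocks needed so that blocks*tpb >= n, without the spare block */
    static inline int ceil_div(int n, int tpb)
    {
        return (n + tpb - 1) / tpb;   /* ceil_div(128, 64) == 2; 128/64 + 1 == 3 */
    }
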
diff --git a/src/seq_mv/gpukernels.h b/src/seq_mv/gpukernels.h
new file mode 100644
index 0000000..62d7bae
--- /dev/null
+++ b/src/seq_mv/gpukernels.h
@@ -0,0 +1,9 @@
+#ifdef HYPRE_USE_GPU
+#include <cuda_runtime_api.h>
+int VecScaleScalar(double *u, const double alpha,  int num_rows,cudaStream_t s);
+void VecCopy(double* tgt, const double* src, int size,cudaStream_t s);
+void VecSet(double* tgt, int size, double value, cudaStream_t s);
+void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void CudaCompileFlagCheck();
+#endif
diff --git a/src/seq_mv/headers b/src/seq_mv/headers
index 1c4c61c..fbf8103 100755
--- a/src/seq_mv/headers
+++ b/src/seq_mv/headers
@@ -49,6 +49,7 @@ cat csr_matrix.h          >> $INTERNAL_HEADER
 cat mapped_matrix.h       >> $INTERNAL_HEADER	
 cat multiblock_matrix.h   >> $INTERNAL_HEADER	
 cat vector.h              >> $INTERNAL_HEADER	
+cat gpukernels.h          >> $INTERNAL_HEADER
 
 ../utilities/protos *.c               >> $INTERNAL_HEADER
 
diff --git a/src/seq_mv/seq_mv.h b/src/seq_mv/seq_mv.h
index d737b9a..a851d4d 100644
--- a/src/seq_mv/seq_mv.h
+++ b/src/seq_mv/seq_mv.h
@@ -59,6 +59,11 @@ typedef struct
    HYPRE_Int     *rownnz;
    HYPRE_Int      num_rownnz;
 
+#ifdef HYPRE_USE_GPU
+  /* Flag for keeping track of prefetching */
+  HYPRE_Int on_device;
+#endif
+
 } hypre_CSRMatrix;
 
 /*--------------------------------------------------------------------------
@@ -208,6 +213,9 @@ typedef struct
       With rowwise storage, vj[i] = data[ j + num_vectors*i] */
    HYPRE_Int  vecstride, idxstride;
    /* ... so vj[i] = data[ j*vecstride + i*idxstride ] regardless of row_storage.*/
+#ifdef HYPRE_USE_GPU
+  HYPRE_Int on_device;
+#endif
 
 } hypre_Vector;
 
@@ -225,6 +233,20 @@ typedef struct
 
 #endif
 
+#ifndef hypre_GPUKERNELS_HEADER
+#define hypre_GPUKERNELS_HEADER
+#ifdef HYPRE_USE_GPU
+#include <cuda_runtime_api.h>
+int VecScaleScalar(double *u, const double alpha,  int num_rows,cudaStream_t s);
+void VecCopy(double* tgt, const double* src, int size,cudaStream_t s);
+void VecSet(double* tgt, int size, double value, cudaStream_t s);
+void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void CudaCompileFlagCheck();
+void PackOnDevice(HYPRE_Complex *send_data,HYPRE_Complex *x_local_data, hypre_int *send_map, hypre_int begin,hypre_int end,cudaStream_t s);
+#endif
+#endif
+
 /* csr_matop.c */
 hypre_CSRMatrix *hypre_CSRMatrixAdd ( hypre_CSRMatrix *A , hypre_CSRMatrix *B );
 hypre_CSRMatrix *hypre_CSRMatrixMultiply ( hypre_CSRMatrix *A , hypre_CSRMatrix *B );
@@ -245,6 +267,11 @@ HYPRE_Int hypre_CSRMatrixPrintHB ( hypre_CSRMatrix *matrix_input , char *file_na
 HYPRE_Int hypre_CSRMatrixCopy ( hypre_CSRMatrix *A , hypre_CSRMatrix *B , HYPRE_Int copy_data );
 hypre_CSRMatrix *hypre_CSRMatrixClone ( hypre_CSRMatrix *A );
 hypre_CSRMatrix *hypre_CSRMatrixUnion ( hypre_CSRMatrix *A , hypre_CSRMatrix *B , HYPRE_Int *col_map_offd_A , HYPRE_Int *col_map_offd_B , HYPRE_Int **col_map_offd_C );
+#ifdef HYPRE_USE_GPU
+void hypre_CSRMatrixPrefetchToDevice(hypre_CSRMatrix *A);
+void hypre_CSRMatrixPrefetchToHost(hypre_CSRMatrix *A);
+hypre_int hypre_CSRMatrixIsManaged(hypre_CSRMatrix *a);
+#endif
 
 /* csr_matvec.c */
 // y[offset:end] = alpha*A[offset:end,:]*x + beta*b[offset:end]
@@ -253,7 +280,9 @@ HYPRE_Int hypre_CSRMatrixMatvecOutOfPlace ( HYPRE_Complex alpha , hypre_CSRMatri
 HYPRE_Int hypre_CSRMatrixMatvec ( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y );
 HYPRE_Int hypre_CSRMatrixMatvecT ( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y );
 HYPRE_Int hypre_CSRMatrixMatvec_FF ( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y , HYPRE_Int *CF_marker_x , HYPRE_Int *CF_marker_y , HYPRE_Int fpt );
-
+#ifdef HYPRE_USE_GPU
+HYPRE_Int hypre_CSRMatrixMatvecDevice( HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *b, hypre_Vector *y, HYPRE_Int offset );
+#endif
 /* genpart.c */
 HYPRE_Int hypre_GeneratePartitioning ( HYPRE_Int length , HYPRE_Int num_procs , HYPRE_Int **part_ptr );
 HYPRE_Int hypre_GenerateLocalPartitioning ( HYPRE_Int length , HYPRE_Int num_procs , HYPRE_Int myid , HYPRE_Int **part_ptr );
@@ -337,7 +366,16 @@ HYPRE_Int hypre_SeqVectorScale ( HYPRE_Complex alpha , hypre_Vector *y );
 HYPRE_Int hypre_SeqVectorAxpy ( HYPRE_Complex alpha , hypre_Vector *x , hypre_Vector *y );
 HYPRE_Real hypre_SeqVectorInnerProd ( hypre_Vector *x , hypre_Vector *y );
 HYPRE_Complex hypre_VectorSumElts ( hypre_Vector *vector );
-
+#ifdef HYPRE_USE_GPU
+HYPRE_Complex hypre_VectorSumAbsElts ( hypre_Vector *vector );
+HYPRE_Int hypre_SeqVectorCopyDevice ( hypre_Vector *x , hypre_Vector *y );
+HYPRE_Int hypre_SeqVectorAxpyDevice( HYPRE_Complex alpha , hypre_Vector *x , hypre_Vector *y );
+HYPRE_Real hypre_SeqVectorInnerProdDevice ( hypre_Vector *x , hypre_Vector *y );
+void hypre_SeqVectorPrefetchToDevice(hypre_Vector *x);
+void hypre_SeqVectorPrefetchToHost(hypre_Vector *x);
+void hypre_SeqVectorPrefetchToDeviceInStream(hypre_Vector *x, HYPRE_Int index);
+hypre_int hypre_SeqVectorIsManaged(hypre_Vector *x);
+#endif
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/seq_mv/vector.c b/src/seq_mv/vector.c
index 619ddba..c573682 100644
--- a/src/seq_mv/vector.c
+++ b/src/seq_mv/vector.c
@@ -18,6 +18,11 @@
 
 #include "seq_mv.h"
 #include <assert.h>
+#ifdef HYPRE_USE_GPU
+#include <cublas_v2.h>
+#include <cusparse.h>
+#include "gpukernels.h"
+#endif
 
 /*--------------------------------------------------------------------------
  * hypre_SeqVectorCreate
@@ -28,7 +33,11 @@ hypre_SeqVectorCreate( HYPRE_Int size )
 {
    hypre_Vector  *vector;
 
-   vector = hypre_CTAlloc(hypre_Vector, 1);
+   vector = hypre_HostCTAlloc(hypre_Vector, 1);
+
+#ifdef HYPRE_USE_GPU
+   vector->on_device=0;
+#endif
 
    hypre_VectorData(vector) = NULL;
    hypre_VectorSize(vector) = size;
@@ -69,7 +78,7 @@ hypre_SeqVectorDestroy( hypre_Vector *vector )
       {
          hypre_TFree(hypre_VectorData(vector));
       }
-      hypre_TFree(vector);
+      hypre_HostTFree(vector);
    }
 
    return ierr;
@@ -246,6 +255,10 @@ HYPRE_Int
 hypre_SeqVectorSetConstantValues( hypre_Vector *v,
                                   HYPRE_Complex value )
 {
+#ifdef HYPRE_USE_GPU
+  VecSet(hypre_VectorData(v),hypre_VectorSize(v),value,HYPRE_STREAM(4));
+  return 0;
+#endif
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime();
 #endif
@@ -310,6 +323,9 @@ HYPRE_Int
 hypre_SeqVectorCopy( hypre_Vector *x,
                      hypre_Vector *y )
 {
+#ifdef HYPRE_USE_GPU
+  return hypre_SeqVectorCopyDevice(x,y);
+#endif
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime();
 #endif
@@ -394,7 +410,10 @@ hypre_SeqVectorScale( HYPRE_Complex alpha,
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime();
 #endif
-
+   
+#ifdef HYPRE_USE_GPU
+   return VecScaleScalar(y->data,alpha, hypre_VectorSize(y),HYPRE_STREAM(4));
+#endif
    HYPRE_Complex *y_data = hypre_VectorData(y);
    HYPRE_Int      size   = hypre_VectorSize(y);
            
@@ -426,6 +445,9 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha,
                      hypre_Vector *x,
                      hypre_Vector *y     )
 {
+#ifdef  HYPRE_USE_GPU
+  return hypre_SeqVectorAxpyDevice(alpha,x,y);
+#endif
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime();
 #endif
@@ -460,6 +482,9 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha,
 HYPRE_Real   hypre_SeqVectorInnerProd( hypre_Vector *x,
                                        hypre_Vector *y )
 {
+#ifdef HYPRE_USE_GPU
+  return hypre_SeqVectorInnerProdDevice(x,y);
+#endif
 #ifdef HYPRE_PROFILE
    hypre_profile_times[HYPRE_TIMER_ID_BLAS1] -= hypre_MPI_Wtime();
 #endif
@@ -506,3 +531,133 @@ HYPRE_Complex hypre_VectorSumElts( hypre_Vector *vector )
 
    return sum;
 }
+
+#ifdef HYPRE_USE_GPU
+/* Sums of the absolute value of the elements for comparison to cublas device side routine */
+HYPRE_Complex hypre_VectorSumAbsElts( hypre_Vector *vector )
+{
+   HYPRE_Complex  sum = 0;
+   HYPRE_Complex *data = hypre_VectorData( vector );
+   HYPRE_Int      size = hypre_VectorSize( vector );
+   HYPRE_Int      i;
+
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(i) reduction(+:sum) HYPRE_SMP_SCHEDULE
+#endif
+   for ( i=0; i<size; ++i ) sum += fabs(data[i]); 
+
+   return sum;
+}
+HYPRE_Int
+hypre_SeqVectorCopyDevice( hypre_Vector *x,
+                           hypre_Vector *y )
+{
+  
+  HYPRE_Complex *x_data = hypre_VectorData(x);
+  HYPRE_Complex *y_data = hypre_VectorData(y);
+  HYPRE_Int      size   = hypre_VectorSize(x);
+  HYPRE_Int      size_y   = hypre_VectorSize(y);
+  
+  HYPRE_Int      i;
+  
+  HYPRE_Int      ierr = 0;
+  
+  if (size > size_y) size = size_y;
+  size *=hypre_VectorNumVectors(x);
+  PUSH_RANGE_PAYLOAD("VECCOPYDEVICE",2,size);
+  hypre_SeqVectorPrefetchToDevice(x);
+  hypre_SeqVectorPrefetchToDevice(y);
+  VecCopy(y_data,x_data,size,HYPRE_STREAM(4));
+  cudaStreamSynchronize(HYPRE_STREAM(4));
+  POP_RANGE;
+  return ierr;
+}
+HYPRE_Int
+hypre_SeqVectorAxpyDevice( HYPRE_Complex alpha,
+                           hypre_Vector *x,
+                           hypre_Vector *y     ){
+
+  HYPRE_Complex *x_data = hypre_VectorData(x);
+  HYPRE_Complex *y_data = hypre_VectorData(y);
+  HYPRE_Int      size   = hypre_VectorSize(x);
+           
+  HYPRE_Int      i;
+           
+  HYPRE_Int      ierr = 0;
+  cublasStatus_t stat;
+  size *=hypre_VectorNumVectors(x);
+
+  PUSH_RANGE_PAYLOAD("DEVAXPY",0,hypre_VectorSize(x));
+  hypre_SeqVectorPrefetchToDevice(x);
+  hypre_SeqVectorPrefetchToDevice(y);
+  static cublasHandle_t handle;
+  static HYPRE_Int firstcall=1;
+  if (firstcall){
+    handle=getCublasHandle();
+    firstcall=0;
+  }
+  cublasErrchk(cublasDaxpy(handle,(HYPRE_Int)size,&alpha,x_data,1,y_data,1));
+  gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+  POP_RANGE;
+  return ierr;
+}
+
+HYPRE_Real   hypre_SeqVectorInnerProdDevice( hypre_Vector *x,
+                                       hypre_Vector *y )
+{
+  PUSH_RANGE_PAYLOAD("DEVDOT",4,hypre_VectorSize(x));
+  static cublasHandle_t handle;
+  static HYPRE_Int firstcall=1;
+
+  HYPRE_Complex *x_data = hypre_VectorData(x);
+  HYPRE_Complex *y_data = hypre_VectorData(y);
+  HYPRE_Int      size   = hypre_VectorSize(x);
+           
+  HYPRE_Int      i;
+
+  HYPRE_Real     result = 0.0;
+  cublasStatus_t stat;
+  if (firstcall){
+    handle = getCublasHandle();
+    firstcall=0;
+  }
+  PUSH_RANGE_PAYLOAD("DEVDOT-PRFETCH",5,hypre_VectorSize(x));
+  //hypre_SeqVectorPrefetchToDevice(x);
+  //hypre_SeqVectorPrefetchToDevice(y);
+  POP_RANGE;
+  PUSH_RANGE_PAYLOAD("DEVDOT-ACTUAL",0,hypre_VectorSize(x));
+  stat=cublasDdot(handle, (HYPRE_Int)size,
+		  x_data, 1,
+		  y_data, 1,
+		  &result);
+  gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+  POP_RANGE;
+  POP_RANGE;
+  return result;
+  
+}
+void hypre_SeqVectorPrefetchToDevice(hypre_Vector *x){
+  if (hypre_VectorSize(x)==0) return;
+  PUSH_RANGE("hypre_SeqVectorPrefetchToDevice",0);
+  gpuErrchk(cudaMemPrefetchAsync(hypre_VectorData(x),hypre_VectorSize(x)*sizeof(HYPRE_Complex),HYPRE_DEVICE,HYPRE_STREAM(4)));
+  gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+  POP_RANGE;
+}
+void hypre_SeqVectorPrefetchToHost(hypre_Vector *x){
+  if (hypre_VectorSize(x)==0) return;
+  PUSH_RANGE("hypre_SeqVectorPrefetchToHost",0);
+  gpuErrchk(cudaMemPrefetchAsync(hypre_VectorData(x),hypre_VectorSize(x)*sizeof(HYPRE_Complex),cudaCpuDeviceId,HYPRE_STREAM(4)));
+  gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(4)));
+  POP_RANGE;
+}
+void hypre_SeqVectorPrefetchToDeviceInStream(hypre_Vector *x, HYPRE_Int index){
+  if (hypre_VectorSize(x)==0) return;
+  PUSH_RANGE("hypre_SeqVectorPrefetchToDevice",0);
+  gpuErrchk(cudaMemPrefetchAsync(hypre_VectorData(x),hypre_VectorSize(x)*sizeof(HYPRE_Complex),HYPRE_DEVICE,HYPRE_STREAM(index)));
+  gpuErrchk(cudaStreamSynchronize(HYPRE_STREAM(index)));
+  POP_RANGE;
+}
+hypre_int hypre_SeqVectorIsManaged(hypre_Vector *x){
+  return pointerIsManaged((void*)hypre_VectorData(x));
+}
+#endif
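The device BLAS-1 wrappers above cache a cublasHandle_t in a function-local
static guarded by a first-call flag, obtained through hypre's
getCublasHandle(). A minimal sketch of that idiom, assuming plain
cublasCreate (the real helper presumably also binds the handle to the compute
stream; as written the pattern is not thread-safe):

    #include <cublas_v2.h>

    static cublasHandle_t cached_cublas_handle(void)
    {
        static cublasHandle_t handle;
        static int first = 1;
        if (first)
        {
            cublasCreate(&handle);   /* stand-in for getCublasHandle() */
            first = 0;
        }
        return handle;
    }
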
diff --git a/src/sstruct_ls/HYPRE_sstruct_int.c b/src/sstruct_ls/HYPRE_sstruct_int.c
index 72fafe0..dd2b54d 100644
--- a/src/sstruct_ls/HYPRE_sstruct_int.c
+++ b/src/sstruct_ls/HYPRE_sstruct_int.c
@@ -27,12 +27,12 @@ hypre_SStructPVectorSetRandomValues( hypre_SStructPVector *pvector, HYPRE_Int se
    hypre_StructVector *svector;
    HYPRE_Int           var;
 
-   srand( seed );
+   hypre_SeedRand( seed );
 
    for (var = 0; var < nvars; var++)
    {
       svector = hypre_SStructPVectorSVector(pvector, var);
-	  seed = rand();
+      seed = hypre_RandI();
       hypre_StructVectorSetRandomValues(svector, seed);
    }
 
@@ -47,12 +47,12 @@ hypre_SStructVectorSetRandomValues( hypre_SStructVector *vector, HYPRE_Int seed
    hypre_SStructPVector *pvector;
    HYPRE_Int             part;
 
-   srand( seed );
+   hypre_SeedRand( seed );
 
    for (part = 0; part < nparts; part++)
    {
       pvector = hypre_SStructVectorPVector(vector, part);
-	  seed = rand();
+      seed = hypre_RandI();
       hypre_SStructPVectorSetRandomValues(pvector, seed);
    }
 
diff --git a/src/sstruct_ls/fac_amr_fcoarsen.c b/src/sstruct_ls/fac_amr_fcoarsen.c
index 1f0299c..fd521d2 100644
--- a/src/sstruct_ls/fac_amr_fcoarsen.c
+++ b/src/sstruct_ls/fac_amr_fcoarsen.c
@@ -53,13 +53,13 @@
    }
 
 
-#define AbsStencilShape(stencil, abs_shape)     \
-   {                                            \
-      HYPRE_Int ii,jj,kk;                       \
-      ii = hypre_IndexX(stencil);               \
-      jj = hypre_IndexY(stencil);               \
-      kk = hypre_IndexZ(stencil);               \
-      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk);   \
+#define AbsStencilShape(stencil, abs_shape)                     \
+   {                                                            \
+      HYPRE_Int ii,jj,kk;                                       \
+      ii = hypre_IndexX(stencil);                               \
+      jj = hypre_IndexY(stencil);                               \
+      kk = hypre_IndexZ(stencil);                               \
+      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \
    }
 
 /*--------------------------------------------------------------------------
@@ -157,7 +157,7 @@ hypre_AMR_FCoarsen( hypre_SStructMatrix  *   A,
 
    HYPRE_Int               i, j, k, l, m, n, ll, kk, jj;
    HYPRE_Int               nvars, var1, var2, var2_start; 
-   HYPRE_Int               iA, iAc, iA_shift_z, iA_shift_zy, iA_shift_zyx;
+   HYPRE_Int               iA_shift_z, iA_shift_zy, iA_shift_zyx;
 
    hypre_Index             lindex;
    hypre_Index             index1, index2;
@@ -1868,19 +1868,17 @@ hypre_AMR_FCoarsen( hypre_SStructMatrix  *   A,
                   /*----------------------------------------------------------------
                    * Loop over interior grid box. 
                    *----------------------------------------------------------------*/
+
                   hypre_BoxGetSize(&fine_box, loop_size);
 
-                  hypre_BoxLoop2Begin(ndim, loop_size,
-                                      A_dbox, fstart, stridef, iA,
-                                      crse_dbox, cstart, stridec, iAc);
+                  hypre_SerialBoxLoop2Begin(ndim, loop_size,
+                                            A_dbox, fstart, stridef, iA,
+                                            crse_dbox, cstart, stridec, iAc);
 #if 0
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,i,rank,index1,index2,m,l,k,j,iA_shift_z,iA_shift_zy,iA_shift_zyx,stencil_i,sum,vals) HYPRE_SMP_SCHEDULE
 #endif
-#else
-                  hypre_BoxLoopSetOneBlock();
 #endif
-                  hypre_BoxLoop2For(iA, iAc)
                   {
                      for (i= 0; i< stencil_size; i++)
                      {
@@ -1986,7 +1984,7 @@ hypre_AMR_FCoarsen( hypre_SStructMatrix  *   A,
                      }
                      crse_ptrs[ rank_stencils[0] ][iAc]= sum;
                   }
-                  hypre_BoxLoop2End(iA, iAc);
+                  hypre_SerialBoxLoop2End(iA, iAc);
                }    /* end hypre_ForBoxI(fi, fbox_interior_ci) */
 
                /*------------------------------------------------------------------
@@ -2050,19 +2048,12 @@ hypre_AMR_FCoarsen( hypre_SStructMatrix  *   A,
                      /*--------------------------------------------------------------
                       * Loop over boundary grid box.
                       *--------------------------------------------------------------*/
+
                      hypre_BoxGetSize(&fine_box, loop_size);
 
-                     hypre_BoxLoop2Begin(ndim, loop_size,
-                                         A_dbox, fstart, stridef, iA,
-                                         crse_dbox, cstart, stridec, iAc);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,i,rank,index1,index2,m,l,k,j,iA_shift_z,iA_shift_zy,iA_shift_zyx,stencil_i,temp3,ll,kk,jj,temp2,cnt1,index_temp,boxman_entry,found,Uventry,nUentries,ncols,rows,cols,vals2,sum,vals) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop2For(iA, iAc)
+                     hypre_SerialBoxLoop2Begin(ndim, loop_size,
+                                               A_dbox, fstart, stridef, iA,
+                                               crse_dbox, cstart, stridec, iAc);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         for (i= 0; i< stencil_size; i++)
@@ -2338,7 +2329,7 @@ hypre_AMR_FCoarsen( hypre_SStructMatrix  *   A,
                         hypre_TFree(temp3);
 
                      }
-                     hypre_BoxLoop2End(iA, iAc);
+                     hypre_SerialBoxLoop2End(iA, iAc);
 
                   }  /* hypre_ForBoxI(fi, fbox_bdy_ci_fi) */
                }      /* hypre_ForBoxArrayI(arrayi, fbox_bdy_ci) */
@@ -2940,7 +2931,7 @@ hypre_AMR_FCoarsen( hypre_SStructMatrix  *   A,
                   }
 
                   hypre_qsort1(interface_stencil_ranks[i], (HYPRE_Real *) temp1, 0,
-                         coarse_stencil_cnt[i]-1);
+                               coarse_stencil_cnt[i]-1);
 
                   /*---------------------------------------------------------------
                    * swap the stencil_vals to agree with the rank swapping.
@@ -3494,6 +3485,7 @@ hypre_AMR_FCoarsen( hypre_SStructMatrix  *   A,
 #endif
                hypre_BoxLoop1For(iA)
                {
+                  HYPRE_Int i;
                   for (i= 0; i< stencil_size; i++)
                   {
                      if (i != centre)
diff --git a/src/sstruct_ls/fac_amr_rap.c b/src/sstruct_ls/fac_amr_rap.c
index 378b17d..c9d03ec 100644
--- a/src/sstruct_ls/fac_amr_rap.c
+++ b/src/sstruct_ls/fac_amr_rap.c
@@ -86,7 +86,6 @@ hypre_AMR_RAP( hypre_SStructMatrix  *A,
    hypre_Index                  index, stride, zero_index;
    HYPRE_Int                    nvars, var1, var2, part, cbox;
    HYPRE_Int                    i, j, k, size;
-   HYPRE_Int                    iA, iAc;
 
    HYPRE_Int                    myid;
    HYPRE_Int                    ierr= 0;
@@ -220,7 +219,7 @@ hypre_AMR_RAP( hypre_SStructMatrix  *A,
                                       smatrix_dbox, ilower, stride, iA,
                                       fac_smatrix_dbox, ilower, stride, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(iA, iAc)
                   {
@@ -277,7 +276,7 @@ hypre_AMR_RAP( hypre_SStructMatrix  *A,
                                       smatrix_dbox, ilower, stride, iA,
                                       fac_smatrix_dbox, ilower, stride, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(iA, iAc)
                   {
@@ -415,7 +414,7 @@ hypre_AMR_RAP( hypre_SStructMatrix  *A,
                                       smatrix_dbox, ilower, stride, iA,
                                       fac_smatrix_dbox, ilower, stride, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(iA, iAc)
                   {
diff --git a/src/sstruct_ls/fac_cf_coarsen.c b/src/sstruct_ls/fac_cf_coarsen.c
index 21ef981..f36d8d6 100644
--- a/src/sstruct_ls/fac_cf_coarsen.c
+++ b/src/sstruct_ls/fac_cf_coarsen.c
@@ -48,17 +48,17 @@
          jj= -1;                                \
       if (kk==2)                                \
          kk= -1;                                \
-      hypre_SetIndex3(stencil, ii, jj, kk);      \
+      hypre_SetIndex3(stencil, ii, jj, kk);     \
    }
 
 
-#define AbsStencilShape(stencil, abs_shape)     \
-   {                                            \
-      HYPRE_Int ii,jj,kk;                       \
-      ii = hypre_IndexX(stencil);               \
-      jj = hypre_IndexY(stencil);               \
-      kk = hypre_IndexZ(stencil);               \
-      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk);   \
+#define AbsStencilShape(stencil, abs_shape)                     \
+   {                                                            \
+      HYPRE_Int ii,jj,kk;                                       \
+      ii = hypre_IndexX(stencil);                               \
+      jj = hypre_IndexY(stencil);                               \
+      kk = hypre_IndexZ(stencil);                               \
+      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \
    }
 
 /*--------------------------------------------------------------------------
@@ -130,7 +130,7 @@ hypre_AMR_CFCoarsen( hypre_SStructMatrix  *   A,
    HYPRE_Int               rank, startrank;
    HYPRE_Real             *vals;
 
-   HYPRE_Int               i, j, iA;
+   HYPRE_Int               i, j;
    HYPRE_Int               nvars, var1; 
 
    hypre_Index             lindex, zero_index;
@@ -219,7 +219,7 @@ hypre_AMR_CFCoarsen( hypre_SStructMatrix  *   A,
          hypre_StructMapCoarseToFine(hypre_BoxIMin(cgrid_box), zero_index,
                                      refine_factors, hypre_BoxIMin(&refined_box));
          hypre_SetIndex3(index1, refine_factors[0]-1, refine_factors[1]-1,
-                        refine_factors[2]-1);
+                         refine_factors[2]-1);
          hypre_StructMapCoarseToFine(hypre_BoxIMax(cgrid_box), index1,
                                      refine_factors, hypre_BoxIMax(&refined_box));
 
@@ -340,17 +340,9 @@ hypre_AMR_CFCoarsen( hypre_SStructMatrix  *   A,
                   fgrid_cinterface= hypre_BoxArrayBox(cinterface_array, boxi);
                   hypre_CopyIndex(hypre_BoxIMin(fgrid_cinterface), node_extents);
                   hypre_BoxGetSize(fgrid_cinterface, loop_size);
-                    
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      A_dbox, node_extents, stridec, iA);
-#if 0 /* Are private static arrays a problem? */
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,lindex,i,index_temp,boxman_entry,rank,found,Uventry,nUentries,temp1,cnt1,ncols,rows,cols,temp2,vals,index2,index1,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(iA)
+
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            A_dbox, node_extents, stridec, iA);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      for (i= 0; i< stencil_size; i++)
@@ -482,7 +474,7 @@ hypre_AMR_CFCoarsen( hypre_SStructMatrix  *   A,
                         }   /* if (Uventry != NULL) */
                      }       /* if (nUventries > 0) */
                   }
-                  hypre_BoxLoop1End(iA);
+                  hypre_SerialBoxLoop1End(iA);
                }  /* for (boxi= stencil_size; boxi< box_array_size; boxi++) */
             }     /* hypre_ForBoxArrayI(fi, cinterface_arrays) */
          }        /* hypre_ForBoxI(ci, cgrid_boxes) */
diff --git a/src/sstruct_ls/fac_interp2.c b/src/sstruct_ls/fac_interp2.c
index 2a0306b..31cc64b 100644
--- a/src/sstruct_ls/fac_interp2.c
+++ b/src/sstruct_ls/fac_interp2.c
@@ -739,9 +739,6 @@ hypre_FAC_WeightedInterp2(void                  *fac_interp_vdata,
    hypre_StructVector     *e_var;
    hypre_StructVector     *recv_var;
 
-   HYPRE_Int               xci;
-   HYPRE_Int               ei;
-
    HYPRE_Real           ***xcp;
    HYPRE_Real           ***ep;
 
@@ -967,17 +964,9 @@ hypre_FAC_WeightedInterp2(void                  *fac_interp_vdata,
             hypre_CopyIndex(hypre_BoxIMin(ownbox), startc);
             hypre_BoxGetSize(ownbox, loop_size);
 
-            hypre_BoxLoop2Begin(ndim, loop_size,
-                                e_dbox,  start,  stride,  ei,
-                                xc_dbox, startc, stridec, xci);
-#if 1
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci,lindex,imax,jmax,kmax,k,offset_kp1,zweight2,kshift,zweight1,j,offset_jp1,yweight2,jshift,yweight1,i,offset_ip1,xweight2,ishift,xweight1) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop2For(ei, xci)
+            hypre_SerialBoxLoop2Begin(ndim, loop_size,
+                                      e_dbox,  start,  stride,  ei,
+                                      xc_dbox, startc, stridec, xci);
             {
                /*--------------------------------------------------------
                 * Linear interpolation. Determine the weights and the
@@ -1191,7 +1180,7 @@ hypre_FAC_WeightedInterp2(void                  *fac_interp_vdata,
                   }         /* for (j= 0; j< jmax; j++) */
                }            /* for (k= 0; k< kmax; k++) */ 
             }
-            hypre_BoxLoop2End(ei, xci);
+            hypre_SerialBoxLoop2End(ei, xci);
 
          }/* hypre_ForBoxI(bi, own_abox) */
       }   /* hypre_ForBoxArray(fi, fgrid_boxes) */
@@ -1297,7 +1286,7 @@ hypre_FAC_WeightedInterp2(void                  *fac_interp_vdata,
                for (j=0; j< jsize; j++)
                {
                   hypre_SetIndex3(temp_index2, 
-                                 ptr_ishift, j+ptr_jshift, k+ptr_kshift);
+                                  ptr_ishift, j+ptr_jshift, k+ptr_kshift);
                   xcp[k][j]= hypre_StructVectorBoxData(recv_var, bi) +
                      hypre_BoxOffsetDistance(xc_dbox, temp_index2);
                }
@@ -1306,17 +1295,9 @@ hypre_FAC_WeightedInterp2(void                  *fac_interp_vdata,
             hypre_CopyIndex(hypre_BoxIMin(ownbox), startc);
             hypre_BoxGetSize(ownbox, loop_size);
 
-            hypre_BoxLoop2Begin(ndim, loop_size,
-                                e_dbox,  start,  stride,  ei,
-                                xc_dbox, startc, stridec, xci);
-#if 1
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci,lindex,imax,jmax,kmax,k,offset_kp1,zweight2,kshift,zweight1,j,offset_jp1,yweight2,jshift,yweight1,i,offset_ip1,xweight2,ishift,xweight1) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop2For(ei, xci)
+            hypre_SerialBoxLoop2Begin(ndim, loop_size,
+                                      e_dbox,  start,  stride,  ei,
+                                      xc_dbox, startc, stridec, xci);
             {
                /*--------------------------------------------------------
                 * Linear interpolation. Determine the weights and the
@@ -1533,7 +1514,7 @@ hypre_FAC_WeightedInterp2(void                  *fac_interp_vdata,
                   }         /* for (j= 0; j< jmax; j++) */
                }            /* for (k= 0; k< kmax; k++) */
             }
-            hypre_BoxLoop2End(ei, xci);
+            hypre_SerialBoxLoop2End(ei, xci);
 
          }  /* if (hypre_BoxVolume(ownbox)) */
       }     /* hypre_ForBoxI(bi, own_abox) */
diff --git a/src/sstruct_ls/fac_restrict2.c b/src/sstruct_ls/fac_restrict2.c
index 213c8ed..3943f50 100644
--- a/src/sstruct_ls/fac_restrict2.c
+++ b/src/sstruct_ls/fac_restrict2.c
@@ -38,7 +38,7 @@
       ii = (ij%2);                              \
       jj = (ij-ii)/2;                           \
       kk = (rank-2*jj-ii)/4;                    \
-      hypre_SetIndex3(stencil, ii, jj, kk);      \
+      hypre_SetIndex3(stencil, ii, jj, kk);     \
    }
 
 /*--------------------------------------------------------------------------
@@ -60,8 +60,8 @@ typedef struct
 
    hypre_CommPkg       **interlevel_comm;
 /*   hypre_CommPkg       **intralevel_comm;*/ /* may need to build an intra comm so
-                                                 that each processor only fullwts its
-                                                 own fine data- may need to add contrib */
+     that each processor only fullwts its
+     own fine data- may need to add contrib */
 
 } hypre_FacSemiRestrictData2;
 
@@ -518,9 +518,6 @@ hypre_FACRestrict2( void                 *  fac_restrict_vdata,
    hypre_StructVector     *xc_var;
    hypre_StructVector     *xf_var;
 
-   HYPRE_Int               xci;
-   HYPRE_Int               xfi;
-
    HYPRE_Real           ***xfp;
    HYPRE_Real           ***xcp;
    HYPRE_Real           ***xcp_temp;
@@ -730,17 +727,9 @@ hypre_FACRestrict2( void                 *  fac_restrict_vdata,
          hypre_BoxGetSize(fgrid_box, temp_index1);
          hypre_StructMapFineToCoarse(temp_index1, temp_index2, rfactors, loop_size);
 
-         hypre_BoxLoop2Begin(ndim, loop_size,
-                             xf_dbox, start, stride,  xfi,
-                             xc_temp_dbox, startc, stridec, xci);
-#if 0 /* Are private static arrays a problem? */
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xfi,xci,imax,jmax,kmax,k,kcell,j,jcell,i,icell,ijkcell,temp_index2) HYPRE_SMP_SCHEDULE
-#endif
-#else
-         hypre_BoxLoopSetOneBlock();
-#endif
-         hypre_BoxLoop2For(xfi, xci)
+         hypre_SerialBoxLoop2Begin(ndim, loop_size,
+                                   xf_dbox, start, stride,  xfi,
+                                   xc_temp_dbox, startc, stridec, xci);
          {
             /*-----------------------------------------------------------------
              * Arithmetic average the refinement patch values to get 
@@ -804,7 +793,7 @@ hypre_FACRestrict2( void                 *  fac_restrict_vdata,
             }
 
          }
-         hypre_BoxLoop2End(xfi, xci);
+         hypre_SerialBoxLoop2End(xfi, xci);
 
       }   /* hypre_ForBoxI(fi, fgrid_boxes) */
    }      /* for (var= 0; var< nvars; var++)*/
diff --git a/src/sstruct_ls/fac_setup2.c b/src/sstruct_ls/fac_setup2.c
index 1427516..cbf26aa 100644
--- a/src/sstruct_ls/fac_setup2.c
+++ b/src/sstruct_ls/fac_setup2.c
@@ -25,7 +25,7 @@ hypre_FacSetup2( void                 *fac_vdata,
                  hypre_SStructVector  *b,
                  hypre_SStructVector  *x )
 {
-	hypre_FACData          *fac_data      =  (hypre_FACData*)fac_vdata;
+   hypre_FACData          *fac_data      =  (hypre_FACData*)fac_vdata;
 
    HYPRE_Int              *plevels       = (fac_data-> plevels);
    hypre_Index            *rfactors      = (fac_data-> prefinements);
@@ -105,7 +105,6 @@ hypre_FacSetup2( void                 *fac_vdata,
    HYPRE_Int              *stencil_vars;
    HYPRE_Real             *values;
    HYPRE_Real             *A_smatrix_value;
-   HYPRE_Int               iA;
  
    HYPRE_Int              *nrows;
    HYPRE_Int             **ncols;
@@ -126,8 +125,8 @@ hypre_FacSetup2( void                 *fac_vdata,
    HYPRE_Int               ierr = 0;
 /*hypre_SStructMatrix *nested_A;
 
-nested_A= hypre_TAlloc(hypre_SStructMatrix , 1);
-nested_A= hypre_CoarsenAMROp(fac_vdata, A);*/
+  nested_A= hypre_TAlloc(hypre_SStructMatrix , 1);
+  nested_A= hypre_CoarsenAMROp(fac_vdata, A);*/
 
    /* generate the composite operator with the computed coarse-grid operators */
    hypre_AMR_RAP(A_in, rfactors, &A_rap);
@@ -498,7 +497,7 @@ nested_A= hypre_CoarsenAMROp(fac_vdata, A);*/
                                    sgrid_box, box_start, stride, k,
                                    A_smatrix_dbox, box_start, stride, iA);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,k,iA) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(k, iA)
                {
@@ -573,7 +572,7 @@ nested_A= hypre_CoarsenAMROp(fac_vdata, A);*/
                                       sgrid_box, box_start, stride, k,
                                       A_smatrix_dbox, box_start, stride, iA);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,k,iA) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(k, iA)
                   {
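In the two fac_setup2.c pragmas above, only the index names k and iA are dropped from
the private() clause; the pragma itself stays, so these loops remain parallel. The
clause can shrink to private(HYPRE_BOX_PRIVATE) because of the usual OpenMP scoping
rule: a variable declared inside the parallel construct is automatically private to
each thread, so once the box-loop macros declare the indices in their own block scope
there is nothing left to list. A small self-contained sketch of that rule (not hypre
code):

    #include <stdio.h>

    int main(void)
    {
       double sum[8] = {0.0};
       int    b;

    #ifdef _OPENMP
    #pragma omp parallel for
    #endif
       for (b = 0; b < 8; b++)     /* loop variable: predetermined private */
       {
          int i;                   /* declared inside: implicitly private  */
          for (i = 0; i < 100; i++)
          {
             sum[b] += 1.0;        /* each iteration owns its b: no race   */
          }
       }

       printf("sum[0] = %g\n", sum[0]);   /* prints sum[0] = 100 */
       return 0;
    }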
diff --git a/src/sstruct_ls/fac_zero_stencilcoef.c b/src/sstruct_ls/fac_zero_stencilcoef.c
index a46d613..b49d586 100644
--- a/src/sstruct_ls/fac_zero_stencilcoef.c
+++ b/src/sstruct_ls/fac_zero_stencilcoef.c
@@ -13,13 +13,13 @@
 #include "_hypre_sstruct_ls.h"
 #include "fac.h"
 
-#define AbsStencilShape(stencil, abs_shape)     \
-   {                                            \
-      HYPRE_Int ii,jj,kk;                       \
-      ii = hypre_IndexX(stencil);               \
-      jj = hypre_IndexY(stencil);               \
-      kk = hypre_IndexZ(stencil);               \
-      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk);   \
+#define AbsStencilShape(stencil, abs_shape)                     \
+   {                                                            \
+      HYPRE_Int ii,jj,kk;                                       \
+      ii = hypre_IndexX(stencil);                               \
+      jj = hypre_IndexY(stencil);                               \
+      kk = hypre_IndexZ(stencil);                               \
+      abs_shape= hypre_abs(ii) + hypre_abs(jj) + hypre_abs(kk); \
    }
 
 /*--------------------------------------------------------------------------
@@ -71,7 +71,6 @@ hypre_FacZeroCFSten( hypre_SStructPMatrix *Af,
    HYPRE_Real            *ac_ptr;
    hypre_Index            loop_size;
 
-   HYPRE_Int              iac;
    HYPRE_Int              ci, i, j;
 
    HYPRE_Int              abs_shape;
@@ -176,7 +175,7 @@ hypre_FacZeroCFSten( hypre_SStructPMatrix *Af,
                                                ac_dbox, hypre_BoxIMin(shift_ibox),
                                                stride, iac);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iac) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                            hypre_BoxLoop1For(iac)
                            {
@@ -258,7 +257,6 @@ hypre_FacZeroFCSten( hypre_SStructPMatrix  *A,
    HYPRE_Real            *a_ptr;
    hypre_Index            loop_size;
 
-   HYPRE_Int              ia;
    HYPRE_Int              fi, fj, i, j;
    HYPRE_Int              abs_shape;
    HYPRE_Int              myid, proc;
@@ -402,7 +400,7 @@ hypre_FacZeroFCSten( hypre_SStructPMatrix  *A,
                                             a_dbox, hypre_BoxIMin(&intersect_box),
                                             stride, ia);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ia) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                         hypre_BoxLoop1For(ia)
                         {
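For reference, AbsStencilShape (redefined above only to realign the continuation
backslashes; the body is unchanged) computes the 1-norm of a stencil offset: 0 picks
out the central (0,0,0) entry, while any positive value marks a coupling to a
neighbouring cell, which these zeroing routines appear to key on when deciding which
coefficients reach across the coarse-fine interface. A standalone sketch of the same
arithmetic (HYPRE_Int stands in for hypre's integer type; hypre_IndexX/Y/Z simply read
the three components of a hypre_Index):

    #include <stdio.h>
    #include <stdlib.h>

    typedef int HYPRE_Int;

    int main(void)
    {
       HYPRE_Int stencil[3] = {-1, 0, 1};   /* offset toward a neighbour */
       HYPRE_Int abs_shape;

       /* same computation as AbsStencilShape */
       abs_shape = abs(stencil[0]) + abs(stencil[1]) + abs(stencil[2]);

       printf("|shape| = %d\n", abs_shape); /* prints 2; 0 would mean the
                                               central coefficient */
       return 0;
    }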
diff --git a/src/sstruct_ls/maxwell_PNedelec.c b/src/sstruct_ls/maxwell_PNedelec.c
index dc2f125..d558b49 100644
--- a/src/sstruct_ls/maxwell_PNedelec.c
+++ b/src/sstruct_ls/maxwell_PNedelec.c
@@ -69,7 +69,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
    HYPRE_Int              nvars, Edge_nvars, part, var;
    HYPRE_Int              tot_vars= 8;
 
-   HYPRE_Int              t, i, j, k, l, m, n, p, r, size;
+   HYPRE_Int              t, i, j, k, l, m, n, p, size;
 
    HYPRE_Int              ilower, iupper;
    HYPRE_Int              jlower, jupper;
@@ -538,16 +538,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
 
             hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-            hypre_BoxLoop1Begin(ndim, loop_size,
-                                &copy_box, start, stride, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,entry,p,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop1For(m)
+            hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                      &copy_box, start, stride, m);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -568,7 +560,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                   j++;
                }
             }
-            hypre_BoxLoop1End(m);
+            hypre_SerialBoxLoop1End(m);
 
          }   /* hypre_ForBoxI */
 
@@ -663,16 +655,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -711,7 +695,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -744,16 +728,9 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                                                  loop_size);
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -788,7 +765,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[2]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -847,16 +824,9 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      /* increase the loop_size by one in the Z_Face direction to
                         cover upper boundary Z_Faces. */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -893,7 +863,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[0]; n++) */
                         }     /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* X_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -927,16 +897,9 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -973,7 +936,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[2]; n++) */
                         }     /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -1031,16 +994,9 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
 
                      /* increase the loop_size by one in the X_Face direction */
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1076,7 +1032,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -1109,16 +1065,9 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1154,7 +1103,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[0]; n++) */
                         }     /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -1204,16 +1153,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1245,7 +1186,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      }     /* for (p= 1; p< rfactor[0]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -1268,16 +1209,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1311,7 +1244,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      }     /* for (p= 1; p< rfactor[1]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -1334,16 +1267,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1386,7 +1311,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      }  /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -1409,16 +1334,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1461,7 +1378,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      }  /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -1485,16 +1402,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,l,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1536,7 +1445,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                         var_index[0]-= (rfactor[0]-1);
                      }  /* for (p= 1; p< rfactor[1]; p++) */
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -1727,16 +1636,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
 
             /* note that the correct cbox corresponding to this non-vanishing
                fbox is used. */
-            hypre_BoxLoop1Begin(ndim, loop_size,
-                                &copy_box, start, stride, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,j,entry,p,cindex,l,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop1For(m)
+            hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                      &copy_box, start, stride, m);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1774,8 +1675,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                   k++;
                }  /* if ((p <= upper_ranks[part][t]) && (p >= lower_ranks[part][t])) */
             }
-
-            hypre_BoxLoop1End(m);
+            hypre_SerialBoxLoop1End(m);
          }   /* hypre_ForBoxI */
 
          hypre_TFree(boxoffset);
@@ -1871,16 +1771,9 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -1943,7 +1836,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y plane direction */
                      hypre_CopyIndex(Edge_cstarts[part][i], cstart);
@@ -1980,16 +1873,9 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2048,7 +1934,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[2]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -2113,16 +1999,9 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2183,7 +2062,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[0]; n++) */
                         }     /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* X_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -2221,16 +2100,9 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+                     
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2289,7 +2161,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                         }     /* for (p= 0; p< rfactor[1]; p++) */
 
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -2350,16 +2222,9 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
 
                      /* increase the loop_size by one in the X plane direction */
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2420,7 +2285,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y plane */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -2457,16 +2322,9 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2525,7 +2383,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                         }     /* for (p= 0; p< rfactor[2]; p++) */
 
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -2576,16 +2434,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2627,7 +2477,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      }     /* for (p= 1; p< rfactor[0]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -2652,16 +2502,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2703,7 +2545,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      }     /* for (p= 1; p< rfactor[1]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -2728,16 +2570,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2806,7 +2640,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                         }     /* for (n= 1; n< rfactor[1]; n++) */
                      }        /* for (p= 1; p< rfactor[2]; p++) */
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -2831,16 +2665,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2909,7 +2735,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                      }        /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -2934,16 +2760,8 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3011,7 +2829,7 @@ hypre_Maxwell_PNedelec( hypre_SStructGrid    *fgrid_edge,
                         }     /* for (n= 1; n< rfactor[0]; n++) */
                      }        /* for (p= 1; p< rfactor[1]; p++) */
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
diff --git a/src/sstruct_ls/maxwell_TV_setup.c b/src/sstruct_ls/maxwell_TV_setup.c
index 7aecfc9..87556a0 100644
--- a/src/sstruct_ls/maxwell_TV_setup.c
+++ b/src/sstruct_ls/maxwell_TV_setup.c
@@ -32,7 +32,7 @@ hypre_MaxwellTV_Setup(void                 *maxwell_vdata,
                       hypre_SStructVector  *b_in,
                       hypre_SStructVector  *x_in)
 {
-	hypre_MaxwellData     *maxwell_TV_data = (hypre_MaxwellData     *)maxwell_vdata;
+   hypre_MaxwellData     *maxwell_TV_data = (hypre_MaxwellData     *)maxwell_vdata;
 
    MPI_Comm               comm = hypre_SStructMatrixComm(Aee_in);
 
@@ -387,13 +387,12 @@ hypre_MaxwellTV_Setup(void                 *maxwell_vdata,
                hypre_BoxGetSize(box_piece, loop_size);
                hypre_CopyIndex(hypre_BoxIMin(box_piece), start);
 
-               hypre_BoxLoop0Begin(ndim, loop_size);
+               hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0 /* Are private static arrays a problem? */
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE
 #endif
 #endif
-               hypre_BoxLoop0For()
                {
                   hypre_BoxLoopGetIndex(lindex);
                   hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -404,7 +403,7 @@ hypre_MaxwellTV_Setup(void                 *maxwell_vdata,
                   flag[rank-start_rank] = 0;
                   flag2[rank-start_rank]= rank;
                }
-               hypre_BoxLoop0End();
+               hypre_SerialBoxLoop0End();
             }  /* if (hypre_BoxVolume(box_piece) < i) */
          }  /* for (m= 0; m< hypre_BoxArraySize(tmp_box_array); m++) */
          hypre_BoxArrayDestroy(tmp_box_array);
@@ -490,7 +489,7 @@ hypre_MaxwellTV_Setup(void                 *maxwell_vdata,
                         hypre_SStructVectorParVector(bn),
                         hypre_SStructVectorParVector(xn));
    {
-	   amg_data = (hypre_ParAMGData*) amg_vdata;
+      amg_data = (hypre_ParAMGData*) amg_vdata;
 
       node_numlevels= hypre_ParAMGDataNumLevels(amg_data);
 
diff --git a/src/sstruct_ls/maxwell_grad.c b/src/sstruct_ls/maxwell_grad.c
index ba5fc9e..d53c9da 100644
--- a/src/sstruct_ls/maxwell_grad.c
+++ b/src/sstruct_ls/maxwell_grad.c
@@ -326,15 +326,13 @@ hypre_Maxwell_Grad(hypre_SStructGrid    *grid)
                hypre_BoxGetSize(box_piece, loop_size);
                hypre_CopyIndex(hypre_BoxIMin(box_piece), start);
          
-               hypre_BoxLoop0Begin(ndim, loop_size);
+               hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0 /* Are private static arrays a problem? */
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE
 #endif
 #else
-               hypre_BoxLoopSetOneBlock();
 #endif
-               hypre_BoxLoop0For()
                {
                   hypre_BoxLoopGetIndex(lindex);
                   hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -344,7 +342,7 @@ hypre_Maxwell_Grad(hypre_SStructGrid    *grid)
                                                         &rank, matrix_type);
                   nflag[rank-start_rank1]= 0; 
                }
-               hypre_BoxLoop0End();
+               hypre_SerialBoxLoop0End();
             }  /* if (hypre_BoxVolume(box_piece) < i) */
 
          }  /* for (m= 0; m< hypre_BoxArraySize(tmp_box_array1); m++) */
@@ -435,15 +433,13 @@ hypre_Maxwell_Grad(hypre_SStructGrid    *grid)
                      hypre_BoxGetSize(box_piece, loop_size);
                      hypre_CopyIndex(hypre_BoxIMin(box_piece), start);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0 /* Are private static arrays a problem? */
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE
 #endif
 #else
-                     hypre_BoxLoopSetOneBlock();
 #endif
-                     hypre_BoxLoop0For()
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -453,7 +449,7 @@ hypre_Maxwell_Grad(hypre_SStructGrid    *grid)
                                                               &rank, matrix_type);
                         eflag[rank-start_rank2]= 0;
                      }
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* if (hypre_BoxVolume(box_piece) < i) */
                }     /* for (k= 0; k< hypre_BoxArraySize(tmp_box_array1); k++) */
 
@@ -467,15 +463,13 @@ hypre_Maxwell_Grad(hypre_SStructGrid    *grid)
                      hypre_BoxGetSize(box_piece, loop_size);
                      hypre_CopyIndex(hypre_BoxIMin(box_piece), start);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0 /* Are private static arrays a problem? */
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,rank) HYPRE_SMP_SCHEDULE
 #endif
 #else
-                     hypre_BoxLoopSetOneBlock();
 #endif
-                     hypre_BoxLoop0For()
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -485,7 +479,7 @@ hypre_Maxwell_Grad(hypre_SStructGrid    *grid)
                                                               &rank, matrix_type);
                         eflag[rank-start_rank2]= 0;
                      }
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* if (hypre_BoxVolume(box_piece) < i) */
                }     /* for (k= 0; k< hypre_BoxArraySize(tmp_box_array2); k++) */
                hypre_BoxArrayDestroy(tmp_box_array2);
@@ -603,15 +597,13 @@ hypre_Maxwell_Grad(hypre_SStructGrid    *grid)
             /* Interior box- loop over each edge and find the row rank and 
                then the column ranks for the connected nodes. Change the 
                appropriate values to 1. */
-            hypre_BoxLoop0Begin(ndim, loop_size);
+            hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,entry,m,i,nrows) HYPRE_SMP_SCHEDULE
 #endif
 #else
-            hypre_BoxLoopSetOneBlock();
 #endif
-            hypre_BoxLoop0For()
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -641,7 +633,7 @@ hypre_Maxwell_Grad(hypre_SStructGrid    *grid)
                ncols[nrows]= 2;
                nrows++;
             }
-            hypre_BoxLoop0End();
+            hypre_SerialBoxLoop0End();
 
             /* now the boundary layers. Two cases to consider: is the
                edge totally on the boundary or is the edge connected
@@ -672,15 +664,13 @@ hypre_Maxwell_Grad(hypre_SStructGrid    *grid)
                   hypre_BoxGetSize(&layer, loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&layer), start);
 
-                  hypre_BoxLoop0Begin(ndim, loop_size);
+                  hypre_SerialBoxLoop0Begin(ndim, loop_size);
 #if 0
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,entry,m,i,nrows) HYPRE_SMP_SCHEDULE
 #endif
 #else
-                  hypre_BoxLoopSetOneBlock();
 #endif
-                  hypre_BoxLoop0For()
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -755,7 +745,7 @@ hypre_Maxwell_Grad(hypre_SStructGrid    *grid)
                      }  /* if (eflag[m-start_rank2]) */
 
                   }
-                  hypre_BoxLoop0End();
+                  hypre_SerialBoxLoop0End();
                }  /* for (ndirection= 0; ndirection< 2; ndirection++) */
             }     /* for (d= 0; d< ndim; d++) */
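The maxwell_grad.c hunks use the zero-index variant: hypre_SerialBoxLoop0Begin carries
no data boxes or offsets at all, and the body recovers its position through
hypre_BoxLoopGetIndex alone. (Note the conversions here leave "#if 0 ... #else #endif"
blocks behind with an empty #else branch once hypre_BoxLoopSetOneBlock() and
hypre_BoxLoop0For() are deleted; the preprocessor discards them, so they are harmless
dead scaffolding.) A sketch of that position-only traversal, with hypothetical
stand-ins for the macros:

    #include <stdio.h>

    static int lindex[3];    /* current loop position, set by the "loop" */

    /* stand-in for hypre_BoxLoopGetIndex */
    #define GetIndex(out) \
       { (out)[0] = lindex[0]; (out)[1] = lindex[1]; (out)[2] = lindex[2]; }

    int main(void)
    {
       int loop_size[3] = {2, 2, 1};   /* box extents */
       int index[3];

       for (lindex[2] = 0; lindex[2] < loop_size[2]; lindex[2]++)
          for (lindex[1] = 0; lindex[1] < loop_size[1]; lindex[1]++)
             for (lindex[0] = 0; lindex[0] < loop_size[0]; lindex[0]++)
             {
                GetIndex(index);      /* no data pointer, only (i,j,k) */
                printf("(%d,%d,%d)\n", index[0], index[1], index[2]);
             }
       return 0;
    }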
 
diff --git a/src/sstruct_ls/maxwell_physbdy.c b/src/sstruct_ls/maxwell_physbdy.c
index 9b60d50..0525a7b 100644
--- a/src/sstruct_ls/maxwell_physbdy.c
+++ b/src/sstruct_ls/maxwell_physbdy.c
@@ -476,15 +476,7 @@ hypre_Maxwell_PhysBdy( hypre_SStructGrid      **grid_l,
                      hypre_BoxGetSize(box, loop_size);
                      hypre_CopyIndex(hypre_BoxIMin(box), start);
       
-                     hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,boxman_entry,cnt) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop0For()
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -497,7 +489,7 @@ hypre_Maxwell_PhysBdy( hypre_SStructGrid      **grid_l,
                         cnt++;
 
                      }
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* hypre_ForBoxI(p, box_array) */
                }     /* hypre_ForBoxArrayI(m, fbdry) */
 
diff --git a/src/sstruct_ls/maxwell_semi_interp.c b/src/sstruct_ls/maxwell_semi_interp.c
index 958006e..351eb45 100644
--- a/src/sstruct_ls/maxwell_semi_interp.c
+++ b/src/sstruct_ls/maxwell_semi_interp.c
@@ -48,7 +48,7 @@ hypre_CreatePTopology(void **PTopology_vdata_ptr)
 HYPRE_Int
 hypre_DestroyPTopology(void *PTopology_vdata)
 {
-	hypre_PTopology       *PTopology= (hypre_PTopology       *)PTopology_vdata;
+   hypre_PTopology       *PTopology= (hypre_PTopology       *)PTopology_vdata;
    HYPRE_Int              ierr     = 0;
 
    if (PTopology)
@@ -148,7 +148,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
    HYPRE_Int              nvars, Face_nvars, Edge_nvars, part, var, box, fboxi;
    HYPRE_Int              tot_vars= 8;
 
-   HYPRE_Int              t, i, j, k, l, m, n, p, r;
+   HYPRE_Int              t, i, j, k, l, m, n, p;
 
    HYPRE_Int              ilower, iupper;
    HYPRE_Int              jlower, jupper;
@@ -1000,15 +1000,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
 
          /* loop over each cell and find the row rank of Element_edge and then
             the column ranks of the connected fine edges. */
-         hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,findex,entry,rank,nElements,low_index,t,hi_index,var,m,k,j,var_index,nElements_iedges) HYPRE_SMP_SCHEDULE
-#endif
-#else
-         hypre_BoxLoopSetOneBlock();
-#endif
-         hypre_BoxLoop0For()
+         hypre_SerialBoxLoop0Begin(ndim, loop_size);
          {
             hypre_BoxLoopGetIndex(lindex);
             hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1122,7 +1114,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                }        /* for (t= 0; t< Face_nvars; t++) */
             }           /* if (ndim == 2) */
          }
-         hypre_BoxLoop0End();
+         hypre_SerialBoxLoop0End();
       }  /* hypre_ForBoxI(i, cboxes) */
    }     /* for (part= 0; part< nparts; part++) */
 
@@ -1277,15 +1269,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      hypre_ClearIndex(stride);
                      hypre_CopyIndex(upper_shifts[part][fboxi], stride);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,entry,rank,nFaces,cell_index,findex,j,ilower,k,var_index,nFaces_iedges) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop0For()
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1354,7 +1338,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                               (rank >= clower_ranks[part][var])) */
                      }
 
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* hypre_ForBoxI(i, cboxes) */
                   break;
                }   /* case 2:  x_Faces-> y_iedges, z_iedges */
@@ -1382,15 +1366,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      hypre_ClearIndex(stride);
                      hypre_CopyIndex(upper_shifts[part][fboxi], stride);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,entry,rank,nFaces,cell_index,findex,j,ilower,k,var_index,nFaces_iedges) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop0For()
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1458,7 +1434,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                               (rank >= clower_ranks[part][var])) */
                      }
 
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* hypre_ForBoxI(i, cboxes) */
                   break;
                }   /* case 3:  y_Faces-> x_iedges, z_iedges */
@@ -1486,15 +1462,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      hypre_ClearIndex(stride);
                      hypre_CopyIndex(upper_shifts[part][fboxi], stride);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,entry,rank,nFaces,cell_index,findex,j,ilower,k,var_index,nFaces_iedges) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop0For()
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1562,7 +1530,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                         }  /* if ((rank <= cupper_ranks[part][var]) &&
                               (rank >= clower_ranks[part][var])) */
                      }
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
                   }  /* hypre_ForBoxI(i, cboxes) */
                   break;
                }   /* case 4:  z_Faces-> x_iedges, y_iedges */
@@ -1713,15 +1681,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
             hypre_ClearIndex(stride);
             hypre_CopyIndex(upper_shifts[part][fboxi], stride);
 
-            hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,entry,rank,nEdges,cell_index,findex,j,var_index,m,entry,rank,nEdges_iedges) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop0For()
+            hypre_SerialBoxLoop0Begin(ndim, loop_size);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1852,7 +1812,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                }   /* if ((rank <= cupper_ranks[part][var]) &&
                       (rank >= clower_ranks[part][var])) */
             }
-            hypre_BoxLoop0End();
+            hypre_SerialBoxLoop0End();
 
          }  /* hypre_ForBoxI(i, cboxes) */
       }     /* for (t= 0; t< Edge_nvars; t++) */
@@ -1935,15 +1895,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
             hypre_BoxGetSize(cbox, loop_size);
             hypre_CopyIndex(hypre_BoxIMin(cbox), start);
 
-            hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,t,var,entry,rank,nElements_Faces,var_index) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop0For()
+            hypre_SerialBoxLoop0Begin(ndim, loop_size);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -1973,7 +1925,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                }
 
             }
-            hypre_BoxLoop0End();
+            hypre_SerialBoxLoop0End();
          }  /* hypre_ForBoxI(i, cboxes) */
       }  /* if (ndim == 3) */
 
@@ -1994,15 +1946,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
          hypre_BoxGetSize(cbox, loop_size);
          hypre_CopyIndex(hypre_BoxIMin(cbox), start);
 
-         hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,cindex,t,var,entry,rank,nElements_Edges,var_index) HYPRE_SMP_SCHEDULE
-#endif
-#else
-         hypre_BoxLoopSetOneBlock();
-#endif
-         hypre_BoxLoop0For()
+         hypre_SerialBoxLoop0Begin(ndim, loop_size);
          {
             hypre_BoxLoopGetIndex(lindex);
             hypre_SetIndex3(cindex, lindex[0], lindex[1], lindex[2]);
@@ -2161,7 +2105,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                }   /* switch (var) */
             }      /* for (t= 0; t< Edge_nvars; t++) */
          }
-         hypre_BoxLoop0End();
+         hypre_SerialBoxLoop0End();
       }  /* hypre_ForBoxI(i, cboxes) */
    }     /* for (part= 0; part< nparts; part++) */
 
@@ -2368,16 +2312,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
 
             hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-            hypre_BoxLoop1Begin(ndim, loop_size,
-                                &copy_box, start, stride, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop1For(m)
+            hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                      &copy_box, start, stride, m);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2398,7 +2334,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                   j++;
                }
             }
-            hypre_BoxLoop1End(m);
+            hypre_SerialBoxLoop1End(m);
 
          }   /* hypre_ForBoxI */
          hypre_TFree(boxoffset);
@@ -2499,16 +2435,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2549,7 +2477,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -2582,16 +2510,9 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
 
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2631,7 +2552,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[2]; n++) */
                         }     /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -2690,16 +2611,9 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
 
                      /* reset and then increase the loop_size by one in the Z_Face direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2738,7 +2652,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[0]; n++) */
                         }     /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* X_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -2772,16 +2686,9 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2820,7 +2727,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[2]; n++) */
                         }     /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -2879,16 +2786,9 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
 
                      /* increase the loop_size by one in the X_Face direction */
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -2930,7 +2830,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[1]; n++) */
                         }     /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -2964,16 +2864,9 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j,l) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3014,7 +2907,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                            }  /* for (n= 1; n< rfactor[0]; n++) */
                         }     /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
 
                   break;
@@ -3064,16 +2957,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3105,7 +2990,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      }     /* for (p= 1; p< rfactor[0]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -3128,16 +3013,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3170,7 +3047,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                         }  /* for (n= 0; n< rfactor[0]; n++) */
                      }     /* for (p= 1; p< rfactor[1]; p++) */
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -3193,16 +3070,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3245,7 +3114,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      }  /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -3268,16 +3137,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3320,7 +3181,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      }  /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -3344,16 +3205,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,k,p,var_index,n,entry,rank,j) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(m)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, m);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3396,7 +3249,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      }  /* for (p= 1; p< rfactor[1]; p++) */
 
                   }
-                  hypre_BoxLoop1End(m);
+                  hypre_SerialBoxLoop1End(m);
                }  /* hypre_ForBoxI(i, fboxes) */
                break;
             }
@@ -3596,16 +3449,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
             /* note that the correct cbox corresponding to this non-vanishing
                fbox is used. */
 
-            hypre_BoxLoop1Begin(ndim, loop_size,
-                                &copy_box, start, stride, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,j,entry,cindex,var_index,rank,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-            hypre_BoxLoopSetOneBlock();
-#endif
-            hypre_BoxLoop1For(m)
+            hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                      &copy_box, start, stride, m);
             {
                hypre_BoxLoopGetIndex(lindex);
                hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3643,7 +3488,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                   k++;
                }
             }
-            hypre_BoxLoop1End(m);
+            hypre_SerialBoxLoop1End(m);
          }   /* hypre_ForBoxI */
          hypre_TFree(boxoffset);
          hypre_TFree(suboffset);
@@ -3754,16 +3599,9 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -3862,7 +3700,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                            }
                         }  /* for (p= 0; p< rfactor[0]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyIndex(Edge_cstarts[part][i], cstart);
@@ -3899,16 +3737,9 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4006,7 +3837,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                         }  /* for (p= 0; p< rfactor[0]; p++) */
 
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -4070,16 +3901,9 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
 
                      /* increase the loop_size by one in the Z plane direction */
                      loop_size[2]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4172,7 +3996,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                            }
                         }  /* for (p= 0; p< rfactor[1]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* X_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -4210,16 +4034,9 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4317,7 +4134,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                         }  /* for (p= 0; p< rfactor[1]; p++) */
 
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -4381,16 +4198,9 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
 
                      /* increase the loop_size by one in the X plane direction */
                      loop_size[0]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4483,7 +4293,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                            }
                         }  /* for (p= 0; p< rfactor[2]; p++) */
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
 
                      /* Y_Face */
                      hypre_CopyBox(cellbox, &copy_box);
@@ -4520,16 +4330,9 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
                      loop_size[1]++;
-                     hypre_BoxLoop1Begin(ndim, loop_size,
-                                         &copy_box, start, rfactor, m);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,m,lindex,findex,cindex,l,var_index,entry,rank2,rank,p,n,face_w1,face_w2,off_proc_flag,stencil_vals,lower,diag,upper,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop1For(m)
+
+                     hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                               &copy_box, start, rfactor, m);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4627,7 +4430,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                         }  /* for (p= 0; p< rfactor[2]; p++) */
 
                      }
-                     hypre_BoxLoop1End(m);
+                     hypre_SerialBoxLoop1End(m);
                   }  /* hypre_ForBoxI(i, fboxes) */
                   break;
                }
@@ -4685,16 +4488,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4751,7 +4546,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      }     /* for (p= 1; p< rfactor[0]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -4776,16 +4571,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -4842,7 +4629,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      }     /* for (p= 1; p< rfactor[1]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -4867,16 +4654,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -5004,7 +4783,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                         }     /* for (n= 1; n< rfactor[1]; n++) */
                      }        /* for (p= 1; p< rfactor[2]; p++) */
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -5029,16 +4808,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -5167,7 +4938,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      }        /* for (p= 1; p< rfactor[2]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
@@ -5192,16 +4963,8 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                                               loop_size);
                   hypre_CopyIndex(hypre_BoxIMin(&copy_box), start);
 
-                  hypre_BoxLoop1Begin(ndim, loop_size,
-                                      &copy_box, start, rfactor, r);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,r,lindex,findex,p,n,m,cindex,entry,rank,var_index,k) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                  hypre_BoxLoopSetOneBlock();
-#endif
-                  hypre_BoxLoop1For(r)
+                  hypre_SerialBoxLoop1Begin(ndim, loop_size,
+                                            &copy_box, start, rfactor, r);
                   {
                      hypre_BoxLoopGetIndex(lindex);
                      hypre_SetIndex3(findex, lindex[0], lindex[1], lindex[2]);
@@ -5330,7 +5093,7 @@ hypre_Maxwell_PTopology(  hypre_SStructGrid    *fgrid_edge,
                      }        /* for (p= 1; p< rfactor[1]; p++) */
 
                   }
-                  hypre_BoxLoop1End(r);
+                  hypre_SerialBoxLoop1End(r);
                }  /* hypre_ForBoxI(i, fboxes) */
 
                break;
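
Every hunk above in hypre_Maxwell_PTopology makes the same substitution: a
hypre_BoxLoop0Begin/hypre_BoxLoop1Begin followed by an "#if 0"-disabled
OpenMP pragma, an unconditional hypre_BoxLoopSetOneBlock(), and a
hypre_BoxLoop*For() collapses into a single hypre_SerialBoxLoop*Begin/End
pair that states the serial intent directly. A minimal compilable sketch of
such a Begin/End pair, using stand-in macros rather than hypre's real
definitions:

   #include <stdio.h>

   /* Stand-ins only: hypre's macros walk an ndim-dimensional box given by
      loop_size; two fixed nested loops suffice to show the call-site shape. */
   #define SerialBoxLoop0Begin(ni, nj)         \
      for (int j = 0; j < (nj); j++)           \
      for (int i = 0; i < (ni); i++)           \
      {

   #define SerialBoxLoop0End() }

   int main(void)
   {
      SerialBoxLoop0Begin(2, 3);
      {
         printf("cell (%d, %d)\n", i, j);   /* loop body, as in the hunks */
      }
      SerialBoxLoop0End();
      return 0;
   }

The call-site shape (Begin; { body } End;) is unchanged from the old macros,
so forcing a loop serial touches only the Begin/End names.
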
diff --git a/src/sstruct_ls/node_relax.c b/src/sstruct_ls/node_relax.c
index 80434a4..a9cf1aa 100644
--- a/src/sstruct_ls/node_relax.c
+++ b/src/sstruct_ls/node_relax.c
@@ -51,7 +51,7 @@ typedef struct
    HYPRE_Real           **A_loc;
    HYPRE_Real            *x_loc;
 
-   /* pointers for vector and matrix data */	
+   /* pointers for vector and matrix data */    
    HYPRE_Real          ***Ap;
    HYPRE_Real           **bp;
    HYPRE_Real           **xp;
@@ -118,7 +118,7 @@ hypre_NodeRelaxCreate( MPI_Comm  comm )
 HYPRE_Int
 hypre_NodeRelaxDestroy( void *relax_vdata )
 {
-	hypre_NodeRelaxData  *relax_data = (hypre_NodeRelaxData  *)relax_vdata;
+   hypre_NodeRelaxData  *relax_data = (hypre_NodeRelaxData  *)relax_vdata;
    HYPRE_Int             i,vi;
    HYPRE_Int             nvars;
 
@@ -147,8 +147,8 @@ hypre_NodeRelaxDestroy( void *relax_vdata )
       hypre_TFree(relax_data -> compute_pkgs);
       hypre_SStructPVectorDestroy(relax_data -> t);
 
-      hypre_TFree(relax_data -> x_loc);
-      hypre_TFree((relax_data ->A_loc)[0]);
+      hypre_UMTFree(relax_data -> x_loc);
+      hypre_UMTFree((relax_data ->A_loc)[0]);
       hypre_TFree(relax_data -> A_loc);
       hypre_TFree(relax_data -> bp);
       hypre_TFree(relax_data -> xp);
@@ -277,9 +277,9 @@ hypre_NodeRelaxSetup(  void                 *relax_vdata,
     * Allocate storage used to invert local diagonal blocks
     *----------------------------------------------------------*/
 
-   x_loc    = hypre_TAlloc(HYPRE_Real   , hypre_NumThreads()*nvars);
+   x_loc    = hypre_UMTAlloc(HYPRE_Real   , hypre_NumThreads()*nvars);
    A_loc    = hypre_TAlloc(HYPRE_Real  *, hypre_NumThreads()*nvars);
-   A_loc[0] = hypre_TAlloc(HYPRE_Real   , hypre_NumThreads()*nvars*nvars);
+   A_loc[0] = hypre_UMTAlloc(HYPRE_Real   , hypre_NumThreads()*nvars*nvars);
    for (vi = 1; vi < hypre_NumThreads()*nvars; vi++)
    {
       A_loc[vi] = A_loc[0] + vi*nvars;
@@ -564,15 +564,8 @@ hypre_NodeRelax(  void               *relax_vdata,
    hypre_Box             *x_data_box;
    hypre_Box             *t_data_box;
                         
-   HYPRE_Int              Ai;
-   HYPRE_Int              bi;
-   HYPRE_Int              xi;
-   HYPRE_Int              ti;
-                        
    HYPRE_Real           **tA_loc = (relax_data -> A_loc);
    HYPRE_Real            *tx_loc = (relax_data -> x_loc);
-   HYPRE_Real           **A_loc;
-   HYPRE_Real            *x_loc;
 
    HYPRE_Real          ***Ap = (relax_data -> Ap);
    HYPRE_Real           **bp = (relax_data -> bp);
@@ -704,12 +697,14 @@ hypre_NodeRelax(  void               *relax_vdata,
                                    b_data_box, start, stride, bi,
                                    x_data_box, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,bi,xi,vi,vj,x_loc,A_loc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop3For(Ai, bi, xi)
                {
-                  A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars];
-                  x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Real   **A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Real    *x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Int vi, vj;
+                   
                   /*------------------------------------------------
                    * Copy rhs and matrix for diagonal coupling
                    * (intra-nodal) into local storage.
@@ -840,6 +835,7 @@ hypre_NodeRelax(  void               *relax_vdata,
 #endif
                hypre_BoxLoop2For(bi, ti)
                {
+                  HYPRE_Int vi;
                   /* Copy rhs into temp vector */ 
                   for (vi = 0; vi < nvars; vi++)
                   {
@@ -904,12 +900,14 @@ hypre_NodeRelax(  void               *relax_vdata,
                                    A_data_box, start, stride, Ai,
                                    t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,ti,vi,vj,x_loc,A_loc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(Ai, ti)
                {
-                  A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars];
-                  x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Real   **A_loc = &tA_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Real    *x_loc = &tx_loc[hypre_BoxLoopBlock()*nvars];
+                  HYPRE_Int vi, vj;
+                  
                   /*------------------------------------------------
                    * Copy rhs and matrix for diagonal coupling
                    * (intra-nodal) into local storage.
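
The node_relax.c hunks shrink the OpenMP private() clauses by moving the
per-iteration temporaries (A_loc, x_loc, vi, vj) into the loop body: a
variable declared inside the body of a parallel-for is automatically private
to each thread, so it no longer needs to be listed in the pragma. A small
self-contained illustration of that rule (plain C with OpenMP, compile with
-fopenmp; the names mirror the diff but nothing here is hypre code):

   #include <stdio.h>
   #ifdef _OPENMP
   #include <omp.h>
   #else
   static int omp_get_thread_num(void) { return 0; }   /* serial fallback */
   #endif

   #define NVARS 3

   int main(void)
   {
      /* per-thread scratch blocks, as tA_loc/tx_loc above; assumes the
         runtime uses at most 64 threads */
      double scratch[64 * NVARS] = {0};

   #ifdef _OPENMP
   #pragma omp parallel for   /* no private(x_loc,vi) clause needed */
   #endif
      for (int i = 0; i < 8; i++)
      {
         double *x_loc = &scratch[omp_get_thread_num() * NVARS];
         for (int vi = 0; vi < NVARS; vi++)   /* vi is also block-local */
            x_loc[vi] = (double) i;
         printf("iteration %d on thread %d\n", i, omp_get_thread_num());
      }
      return 0;
   }

The same hunks also switch x_loc and A_loc[0] over to hypre_UMTAlloc and
hypre_UMTFree, presumably so the per-thread scratch is reachable from device
code as well; the plain stack array above is only a stand-in for that storage.
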
diff --git a/src/sstruct_ls/sstruct_sharedDOFComm.c b/src/sstruct_ls/sstruct_sharedDOFComm.c
index c622c1d..1ea2677 100644
--- a/src/sstruct_ls/sstruct_sharedDOFComm.c
+++ b/src/sstruct_ls/sstruct_sharedDOFComm.c
@@ -48,7 +48,7 @@ hypre_MaxwellOffProcRowCreate(HYPRE_Int ncols)
 HYPRE_Int
 hypre_MaxwellOffProcRowDestroy(void *OffProcRow_vdata)
 {
-	hypre_MaxwellOffProcRow  *OffProcRow= (hypre_MaxwellOffProcRow  *)OffProcRow_vdata;
+   hypre_MaxwellOffProcRow  *OffProcRow= (hypre_MaxwellOffProcRow  *)OffProcRow_vdata;
    HYPRE_Int                 ierr= 0;
 
    if (OffProcRow)
@@ -689,15 +689,7 @@ hypre_SStructSharedDOF_ParcsrMatRowsComm( hypre_SStructGrid    *grid,
                      hypre_BoxGetSize(&boxman_entry_box, loop_size);
                      hypre_CopyIndex(hypre_BoxIMin(&boxman_entry_box), start);
 
-                     hypre_BoxLoop0Begin(ndim, loop_size);
-#if 0
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,lindex,index,entry,rank,tot_nsendRowsNcols,n,col_inds,values,send_ColsData_alloc,k,tot_sendColsData) HYPRE_SMP_SCHEDULE
-#endif
-#else
-                     hypre_BoxLoopSetOneBlock();
-#endif
-                     hypre_BoxLoop0For()
+                     hypre_SerialBoxLoop0Begin(ndim, loop_size);
                      {
                         hypre_BoxLoopGetIndex(lindex);
                         hypre_SetIndex3(index, lindex[0], lindex[1], lindex[2]);
@@ -742,7 +734,7 @@ hypre_SStructSharedDOF_ParcsrMatRowsComm( hypre_SStructGrid    *grid,
                            }  /* if (rank <= end_rank && rank >= start_rank) */
                         }     /* if (entry) */
                      }
-                     hypre_BoxLoop0End();
+                     hypre_SerialBoxLoop0End();
 
                   }  /* if (proc != myproc) */
                }     /* for (m= 0; m< nboxman_entries; m++) */
diff --git a/src/sstruct_mv/_hypre_sstruct_mv.h b/src/sstruct_mv/_hypre_sstruct_mv.h
index 9586842..bddee7c 100644
--- a/src/sstruct_mv/_hypre_sstruct_mv.h
+++ b/src/sstruct_mv/_hypre_sstruct_mv.h
@@ -1,4 +1,7 @@
 
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
+
 #ifndef hypre_SSTRUCT_MV_HEADER
 #define hypre_SSTRUCT_MV_HEADER
 
diff --git a/src/sstruct_mv/headers b/src/sstruct_mv/headers
index 4af2699..048839c 100755
--- a/src/sstruct_mv/headers
+++ b/src/sstruct_mv/headers
@@ -19,6 +19,9 @@ INTERNAL_HEADER=_hypre_sstruct_mv.h
 
 cat > $INTERNAL_HEADER <<@
 
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
+
 #ifndef hypre_SSTRUCT_MV_HEADER
 #define hypre_SSTRUCT_MV_HEADER
 
diff --git a/src/sstruct_mv/sstruct_matrix.c b/src/sstruct_mv/sstruct_matrix.c
index c3e221e..b717957 100644
--- a/src/sstruct_mv/sstruct_matrix.c
+++ b/src/sstruct_mv/sstruct_matrix.c
@@ -728,7 +728,7 @@ hypre_SStructUMatrixInitialize( hypre_SStructMatrix *matrix )
             box = hypre_BoxArrayBox(boxes, b);
             hypre_CopyBox(box, ghost_box);
             if (matrix_type == HYPRE_SSTRUCT || matrix_type == HYPRE_STRUCT)
-	    {
+            {
                hypre_BoxGrowByArray(ghost_box, hypre_StructGridNumGhost(sgrid));
             }
             start = hypre_BoxIMin(box);
@@ -963,7 +963,7 @@ hypre_SStructUMatrixSetBoxValues( hypre_SStructMatrix *matrix,
    hypre_IndexRef        start;
    hypre_Index           rs, cs;
    HYPRE_Int             row_base, col_base;
-   HYPRE_Int             d, ei, entry, ii, jj, i, mi, vi;
+   HYPRE_Int             ei, entry, ii, jj, i;
    HYPRE_Int             matrix_type = hypre_SStructMatrixObjectType(matrix);
 
    box  = hypre_BoxCreate(ndim);
@@ -982,7 +982,7 @@ hypre_SStructUMatrixSetBoxValues( hypre_SStructMatrix *matrix,
       int_box = hypre_BoxCreate(ndim);
 
       nrows    = hypre_BoxVolume(vbox)*nentries;
-      ncols    = hypre_CTAlloc(HYPRE_Int, nrows);
+      ncols    = hypre_UMCTAlloc(HYPRE_Int, nrows);
 #ifdef HYPRE_USING_OPENMP
 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
 #endif
@@ -990,9 +990,9 @@ hypre_SStructUMatrixSetBoxValues( hypre_SStructMatrix *matrix,
       {
          ncols[i] = 1;
       }
-      rows     = hypre_CTAlloc(HYPRE_Int, nrows);
-      cols     = hypre_CTAlloc(HYPRE_Int, nrows);
-      ijvalues = hypre_CTAlloc(HYPRE_Complex, nrows);
+      rows     = hypre_UMCTAlloc(HYPRE_Int, nrows);
+      cols     = hypre_UMCTAlloc(HYPRE_Int, nrows);
+      ijvalues = hypre_UMCTAlloc(HYPRE_Complex, nrows);
 
       hypre_SetIndex(stride, 1);
 
@@ -1045,14 +1045,18 @@ hypre_SStructUMatrixSetBoxValues( hypre_SStructMatrix *matrix,
                      
                start = hypre_BoxIMin(int_box);
                hypre_BoxGetSize(int_box, loop_size);
-               hypre_BoxLoop2Begin(ndim, loop_size,
+               /*FIXME: It has to be the old boxloop */
+               zypre_BoxLoop2Begin(ndim, loop_size,
                                    int_box, start, stride, mi,
                                    vbox,    start, stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi,vi,index,d) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-               hypre_BoxLoop2For(mi, vi)
+               zypre_BoxLoop2For(mi, vi)
                {
+                  hypre_Index index;
+                  HYPRE_Int   d;
+                  
                   hypre_BoxLoopGetIndex(index);
                   rows[nrows + mi] = row_base;
                   cols[nrows + mi] = col_base;
@@ -1063,7 +1067,7 @@ hypre_SStructUMatrixSetBoxValues( hypre_SStructMatrix *matrix,
                   }
                   ijvalues[nrows + mi] = values[ei + vi*nentries];
                }
-               hypre_BoxLoop2End(mi, vi);
+               zypre_BoxLoop2End(mi, vi);
 
                nrows += hypre_BoxVolume(int_box);
 
@@ -1099,10 +1103,10 @@ hypre_SStructUMatrixSetBoxValues( hypre_SStructMatrix *matrix,
 
       hypre_TFree(boxman_entries);
       
-      hypre_TFree(ncols);
-      hypre_TFree(rows);
-      hypre_TFree(cols);
-      hypre_TFree(ijvalues);
+      hypre_UMTFree(ncols);
+      hypre_UMTFree(rows);
+      hypre_UMTFree(cols);
+      hypre_UMTFree(ijvalues);
 
       hypre_BoxDestroy(to_box);
       hypre_BoxDestroy(map_box);
@@ -1117,9 +1121,7 @@ hypre_SStructUMatrixSetBoxValues( hypre_SStructMatrix *matrix,
    {
       /* RDF: THREAD (Check safety on UMatrixSetValues call) */
       hypre_BoxGetSize(vbox, loop_size);
-      hypre_BoxLoop0Begin(ndim, loop_size);
-      hypre_BoxLoopSetOneBlock();
-      hypre_BoxLoop0For()
+      hypre_SerialBoxLoop0Begin(ndim, loop_size);
       {
          hypre_BoxLoopGetIndex(index);
          hypre_AddIndexes(index, hypre_BoxIMin(vbox), ndim, index);
@@ -1127,7 +1129,7 @@ hypre_SStructUMatrixSetBoxValues( hypre_SStructMatrix *matrix,
                                        nentries, entries, values, action);
          values += nentries;
       }
-      hypre_BoxLoop0End();
+      hypre_SerialBoxLoop0End();
    }
 
    hypre_BoxDestroy(box);
@@ -1383,7 +1385,7 @@ hypre_SStructMatrixSetInterPartValues( HYPRE_SStructMatrix  matrix,
    hypre_SStructBoxManInfo *frinfo, *toinfo;
    HYPRE_Complex           *tvalues = NULL;
    HYPRE_Int                nfrentries, ntoentries, frpart, topart;
-   HYPRE_Int                entry, sentry, ei, fri, toi, vi, mi;
+   HYPRE_Int                entry, sentry, ei, fri, toi;
 
    pmatrix = hypre_SStructMatrixPMatrix(matrix, part);
 
@@ -1481,7 +1483,7 @@ hypre_SStructMatrixSetInterPartValues( HYPRE_SStructMatrix  matrix,
                                          ibox1, start, stride, mi,
                                          vbox,  start, stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi,vi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop2For(mi, vi)
                      {
@@ -1513,7 +1515,7 @@ hypre_SStructMatrixSetInterPartValues( HYPRE_SStructMatrix  matrix,
                                          ibox1, start, stride, mi,
                                          vbox,  start, stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi,vi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop2For(mi, vi)
                      {
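
In hypre_SStructUMatrixSetBoxValues the scratch arrays (ncols, rows, cols,
ijvalues) move from hypre_CTAlloc to hypre_UMCTAlloc, and the matching frees
at the bottom of the function move from hypre_TFree to hypre_UMTFree. The
invariant the hunks maintain is the pairing: memory obtained from the
unified-memory allocator must go back through the unified-memory free. A
sketch with hypothetical um_calloc/um_free wrappers (a CUDA build would back
them with cudaMallocManaged/cudaFree; plain calloc/free stand in here so the
sketch compiles anywhere):

   #include <stdio.h>
   #include <stdlib.h>

   static void *um_calloc(size_t n, size_t size) { return calloc(n, size); }
   static void  um_free(void *p)                 { free(p); }

   int main(void)
   {
      size_t nrows = 100;
      int *ncols = um_calloc(nrows, sizeof(int));   /* as hypre_UMCTAlloc */
      int *rows  = um_calloc(nrows, sizeof(int));
      if (!ncols || !rows) return 1;

      for (size_t i = 0; i < nrows; i++)
         ncols[i] = 1;                  /* one column per row, as above */
      printf("ncols[0] = %d\n", ncols[0]);

      um_free(rows);                    /* paired frees, as hypre_UMTFree */
      um_free(ncols);
      return 0;
   }

Note also the zypre_BoxLoop2Begin swap with its FIXME: that loop writes into
rows/cols/ijvalues through computed offsets, and the comment pins it to the
old (host-side) boxloop until it is safe under the new loop machinery.
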
diff --git a/src/sstruct_mv/sstruct_vector.c b/src/sstruct_mv/sstruct_vector.c
index 939c7f8..af21856 100644
--- a/src/sstruct_mv/sstruct_vector.c
+++ b/src/sstruct_mv/sstruct_vector.c
@@ -614,11 +614,9 @@ hypre_SStructVectorParConvert( hypre_SStructVector  *vector,
    hypre_SStructPVector *pvector;
    hypre_StructVector   *y;
    hypre_Box            *y_data_box;
-   HYPRE_Int             yi;
    HYPRE_Complex        *yp;
    hypre_BoxArray       *boxes;
    hypre_Box            *box;
-   HYPRE_Int             bi;
    hypre_Index           loop_size;
    hypre_IndexRef        start;
    hypre_Index           stride;
@@ -655,7 +653,7 @@ hypre_SStructVectorParConvert( hypre_SStructVector  *vector,
                                 y_data_box, start, stride, yi,
                                 box,        start, stride, bi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,bi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(yi, bi)
             {
@@ -698,11 +696,9 @@ hypre_SStructVectorParRestore( hypre_SStructVector *vector,
    hypre_SStructPVector *pvector;
    hypre_StructVector   *y;
    hypre_Box            *y_data_box;
-   HYPRE_Int             yi;
    HYPRE_Complex        *yp;
    hypre_BoxArray       *boxes;
    hypre_Box            *box;
-   HYPRE_Int             bi;
    hypre_Index           loop_size;
    hypre_IndexRef        start;
    hypre_Index           stride;
@@ -741,7 +737,7 @@ hypre_SStructVectorParRestore( hypre_SStructVector *vector,
                                    y_data_box, start, stride, yi,
                                    box,        start, stride, bi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,bi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(yi, bi)
                {
diff --git a/src/struct_ls/HYPRE_struct_int.c b/src/struct_ls/HYPRE_struct_int.c
index 523e5b6..f49744d 100644
--- a/src/struct_ls/HYPRE_struct_int.c
+++ b/src/struct_ls/HYPRE_struct_int.c
@@ -19,7 +19,6 @@ hypre_StructVectorSetRandomValues( hypre_StructVector *vector,
 {
    hypre_Box          *v_data_box;
                     
-   HYPRE_Int           vi;
    HYPRE_Real         *vp;
 
    hypre_BoxArray     *boxes;
@@ -34,7 +33,8 @@ hypre_StructVectorSetRandomValues( hypre_StructVector *vector,
     * Set the vector coefficients
     *-----------------------------------------------------------------------*/
 
-   srand( seed );
+//   srand( seed );
+   hypre_SeedRand(seed);
 
    hypre_SetIndex3(unit_stride, 1, 1, 1);
  
@@ -53,11 +53,12 @@ hypre_StructVectorSetRandomValues( hypre_StructVector *vector,
       hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                           v_data_box, start, unit_stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(vi)
       {
-         vp[vi] = 2.0*rand()/RAND_MAX - 1.0;
+//         vp[vi] = 2.0*rand()/RAND_MAX - 1.0;
+         vp[vi] = 2.0*hypre_Rand() - 1.0;
       }
       hypre_BoxLoop1End(vi);
    }
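
hypre_StructVectorSetRandomValues drops the C library srand()/rand() pair for
hypre_SeedRand()/hypre_Rand(). Judging from the replacement of
2.0*rand()/RAND_MAX - 1.0 by 2.0*hypre_Rand() - 1.0, hypre_Rand() returns a
uniform value in [0,1], so the generated vectors no longer depend on the
platform's rand() and stay usable inside the boxloop machinery. A
stand-alone sketch of a generator with that contract (the LCG constants are
Knuth's MMIX values; none of this is hypre's actual implementation):

   #include <stdint.h>
   #include <stdio.h>

   static uint64_t rng_state = 1;

   static void my_seed(uint64_t seed) { rng_state = seed ? seed : 1; }

   static double my_rand01(void)   /* uniform in [0,1) */
   {
      rng_state = rng_state * 6364136223846793005ULL
                            + 1442695040888963407ULL;
      return (double)(rng_state >> 11) * (1.0 / 9007199254740992.0); /* 2^53 */
   }

   int main(void)
   {
      my_seed(42);
      for (int i = 0; i < 3; i++)
         printf("%f\n", 2.0 * my_rand01() - 1.0);   /* in [-1,1), as above */
      return 0;
   }
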
diff --git a/src/struct_ls/HYPRE_struct_pcg.c b/src/struct_ls/HYPRE_struct_pcg.c
index 887576d..39d80d9 100644
--- a/src/struct_ls/HYPRE_struct_pcg.c
+++ b/src/struct_ls/HYPRE_struct_pcg.c
@@ -203,10 +203,6 @@ HYPRE_StructDiagScale( HYPRE_StructSolver solver,
    HYPRE_Real           *Ap;
    HYPRE_Real           *yp;
    HYPRE_Real           *xp;
-                       
-   HYPRE_Int             Ai;
-   HYPRE_Int             yi;
-   HYPRE_Int             xi;
                      
    hypre_Index           index;
    hypre_IndexRef        start;
@@ -240,7 +236,7 @@ HYPRE_StructDiagScale( HYPRE_StructSolver solver,
                           x_data_box, start, stride, xi,
                           y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop3For(Ai, xi, yi)
       {
diff --git a/src/struct_ls/_hypre_struct_ls.h b/src/struct_ls/_hypre_struct_ls.h
index 15a8464..6c66698 100644
--- a/src/struct_ls/_hypre_struct_ls.h
+++ b/src/struct_ls/_hypre_struct_ls.h
@@ -32,8 +32,6 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-
-
 /* coarsen.c */
 HYPRE_Int hypre_StructMapFineToCoarse ( hypre_Index findex , hypre_Index index , hypre_Index stride , hypre_Index cindex );
 HYPRE_Int hypre_StructMapCoarseToFine ( hypre_Index cindex , hypre_Index index , hypre_Index stride , hypre_Index findex );
diff --git a/src/struct_ls/cyclic_reduction.c b/src/struct_ls/cyclic_reduction.c
index 4f88ed6..0e118e9 100644
--- a/src/struct_ls/cyclic_reduction.c
+++ b/src/struct_ls/cyclic_reduction.c
@@ -26,7 +26,7 @@
 #define hypre_CycRedSetCIndex(base_index, base_stride, level, cdir, cindex) \
    {                                                                    \
       if (level > 0)                                                    \
-         hypre_SetIndex3(cindex, 0, 0, 0);                               \
+         hypre_SetIndex3(cindex, 0, 0, 0);                              \
       else                                                              \
          hypre_CopyIndex(base_index,  cindex);                          \
       hypre_IndexD(cindex, cdir) += 0;                                  \
@@ -35,7 +35,7 @@
 #define hypre_CycRedSetFIndex(base_index, base_stride, level, cdir, findex) \
    {                                                                    \
       if (level > 0)                                                    \
-         hypre_SetIndex3(findex, 0, 0, 0);                               \
+         hypre_SetIndex3(findex, 0, 0, 0);                              \
       else                                                              \
          hypre_CopyIndex(base_index,  findex);                          \
       hypre_IndexD(findex, cdir) += 1;                                  \
@@ -44,7 +44,7 @@
 #define hypre_CycRedSetStride(base_index, base_stride, level, cdir, stride) \
    {                                                                    \
       if (level > 0)                                                    \
-         hypre_SetIndex3(stride, 1, 1, 1);                               \
+         hypre_SetIndex3(stride, 1, 1, 1);                              \
       else                                                              \
          hypre_CopyIndex(base_stride, stride);                          \
       hypre_IndexD(stride, cdir) *= 2;                                  \
@@ -238,11 +238,8 @@ hypre_CycRedSetupCoarseOp( hypre_StructMatrix *A,
 
    HYPRE_Real             *a_cc, *a_cw, *a_ce;
    HYPRE_Real             *ac_cc, *ac_cw, *ac_ce;
-                    
-   HYPRE_Int               iA, iAm1, iAp1;
-   HYPRE_Int               iAc;
                          
-   HYPRE_Int               offsetA; 
+   HYPRE_Int               offsetA;
                          
    stridef = cstride;
    hypre_SetIndex3(stridec, 1, 1, 1);
@@ -334,12 +331,12 @@ hypre_CycRedSetupCoarseOp( hypre_StructMatrix *A,
                              A_dbox, fstart, stridef, iA,
                              Ac_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,iAm1,iAp1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop2For(iA, iAc)
          {
-            iAm1 = iA - offsetA;
-            iAp1 = iA + offsetA;
+            HYPRE_Int iAm1 = iA - offsetA;
+            HYPRE_Int iAp1 = iA + offsetA;
 
             ac_cw[iAc] = - a_cw[iA] *a_cw[iAm1] / a_cc[iAm1];
 
@@ -365,12 +362,12 @@ hypre_CycRedSetupCoarseOp( hypre_StructMatrix *A,
                              A_dbox, fstart, stridef, iA,
                              Ac_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc,iAm1,iAp1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop2For(iA, iAc)
          {
-            iAm1 = iA - offsetA;
-            iAp1 = iA + offsetA;
+            HYPRE_Int iAm1 = iA - offsetA;
+            HYPRE_Int iAp1 = iA + offsetA;
 
             ac_cw[iAc] = - a_cw[iA] *a_cw[iAm1] / a_cc[iAm1];
 
@@ -431,7 +428,7 @@ hypre_CycRedSetupCoarseOp( hypre_StructMatrix *A,
             hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size,
                                 Ac_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(iAc)
             {
@@ -453,7 +450,7 @@ hypre_CycRedSetupCoarseOp( hypre_StructMatrix *A,
             hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size,
                                 Ac_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(iAc)
             {
@@ -482,7 +479,7 @@ hypre_CyclicReductionSetup( void               *cyc_red_vdata,
                             hypre_StructVector *b,
                             hypre_StructVector *x             )
 {
-	hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *) cyc_red_vdata;
+   hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *) cyc_red_vdata;
 
    MPI_Comm                comm        = (cyc_red_data -> comm);
    HYPRE_Int               cdir        = (cyc_red_data -> cdir);
@@ -594,7 +591,7 @@ hypre_CyclicReductionSetup( void               *cyc_red_vdata,
    /*-----------------------------------------------------
     * Set up matrix and vector structures
     *-----------------------------------------------------*/
-
+   
    A_l  = hypre_TAlloc(hypre_StructMatrix *, num_levels);
    x_l  = hypre_TAlloc(hypre_StructVector *, num_levels);
 
@@ -615,8 +612,9 @@ hypre_CyclicReductionSetup( void               *cyc_red_vdata,
       data_size += hypre_StructVectorDataSize(x_l[l+1]);
    }
 
-   data = hypre_SharedCTAlloc(HYPRE_Real, data_size);
-
+   //data = hypre_SharedCTAlloc(HYPRE_Real, data_size);
+   data =  hypre_DeviceCTAlloc(HYPRE_Real,data_size);
+   
    (cyc_red_data -> data) = data;
 
    for (l = 0; l < (num_levels - 1); l++)
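
Here hypre_SharedCTAlloc is replaced by hypre_DeviceCTAlloc, and the matching hypre_DeviceTFree appears in hypre_CyclicReductionDestroy below; the invariant is that memory must be released through the same memory-space macro family that allocated it. A host-only sketch of that pairing, with hypothetical stand-ins for the macros (the real ones dispatch to device or unified-memory allocators depending on the build):

    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical stand-ins for hypre_DeviceCTAlloc / hypre_DeviceTFree. */
    static double *device_ctalloc_real(size_t count)
    {
       double *p = malloc(count * sizeof *p);
       if (p) memset(p, 0, count * sizeof *p);  /* CTAlloc zero-initializes */
       return p;
    }
    static void device_tfree(double *p) { free(p); }

    int main(void)
    {
       size_t data_size = 1024;
       double *data = device_ctalloc_real(data_size);  /* Setup */
       /* ... carve per-level vectors out of data ... */
       device_tfree(data);                             /* Destroy */
       return 0;
    }
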
@@ -739,7 +737,7 @@ hypre_CyclicReduction( void               *cyc_red_vdata,
                        hypre_StructVector *b,
                        hypre_StructVector *x             )
 {
-	hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
+   hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
 
    HYPRE_Int             num_levels      = (cyc_red_data -> num_levels);
    HYPRE_Int             cdir            = (cyc_red_data -> cdir);
@@ -773,11 +771,6 @@ hypre_CyclicReduction( void               *cyc_red_vdata,
    HYPRE_Real           *xp, *xwp, *xep;
    HYPRE_Real           *bp;
    HYPRE_Real           *xcp;
-                       
-   HYPRE_Int             Ai;
-   HYPRE_Int             xi;
-   HYPRE_Int             bi;
-   HYPRE_Int             xci;
                      
    hypre_Index           cindex;
    hypre_Index           stride;
@@ -821,12 +814,12 @@ hypre_CyclicReduction( void               *cyc_red_vdata,
 
       hypre_CopyIndex(hypre_BoxIMin(compute_box), start);
       hypre_BoxGetStrideSize(compute_box, base_stride, loop_size);
-
+          
       hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size,
                           x_dbox, start, base_stride, xi,
                           b_dbox, start, base_stride, bi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,bi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(xi, bi)
       {
@@ -879,7 +872,7 @@ hypre_CyclicReduction( void               *cyc_red_vdata,
                              A_dbox, start, stride, Ai,
                              x_dbox, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop2For(Ai, xi)
          {
@@ -952,13 +945,13 @@ hypre_CyclicReduction( void               *cyc_red_vdata,
                hypre_StructMapFineToCoarse(start, cindex, stride, startc);
 
                hypre_BoxGetStrideSize(compute_box, stride, loop_size);
-
+                           
                hypre_BoxLoop3Begin(hypre_StructVectorNDim(x), loop_size,
                                    A_dbox, start, stride, Ai,
                                    x_dbox, start, stride, xi,
                                    xc_dbox, startc, stridec, xci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,xci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop3For(Ai, xi, xci)
                {
@@ -1003,7 +996,7 @@ hypre_CyclicReduction( void               *cyc_red_vdata,
                           A_dbox, start, stride, Ai,
                           x_dbox, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(Ai, xi)
       {
@@ -1128,7 +1121,7 @@ hypre_CyclicReduction( void               *cyc_red_vdata,
                                    A_dbox, start, stride, Ai,
                                    x_dbox, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(Ai, xi)
                {
@@ -1159,7 +1152,7 @@ hypre_CyclicReductionSetBase( void        *cyc_red_vdata,
                               hypre_Index  base_index,
                               hypre_Index  base_stride )
 {
-	hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
+   hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
    HYPRE_Int                d;
  
    for (d = 0; d < 3; d++)
@@ -1181,7 +1174,7 @@ HYPRE_Int
 hypre_CyclicReductionSetCDir( void        *cyc_red_vdata,
                               HYPRE_Int    cdir )
 {
-	hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
+   hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
  
    (cyc_red_data -> cdir) = cdir;
 
@@ -1195,7 +1188,7 @@ hypre_CyclicReductionSetCDir( void        *cyc_red_vdata,
 HYPRE_Int
 hypre_CyclicReductionDestroy( void *cyc_red_vdata )
 {
-	hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
+   hypre_CyclicReductionData *cyc_red_data = (hypre_CyclicReductionData *)cyc_red_vdata;
 
    HYPRE_Int l;
 
@@ -1215,7 +1208,7 @@ hypre_CyclicReductionDestroy( void *cyc_red_vdata )
          hypre_ComputePkgDestroy(cyc_red_data -> up_compute_pkg_l[l]);
       }
       hypre_BoxArrayDestroy(cyc_red_data -> fine_points_l[l]);
-      hypre_SharedTFree(cyc_red_data -> data); 
+      hypre_DeviceTFree(cyc_red_data -> data);
       hypre_TFree(cyc_red_data -> grid_l);
       hypre_TFree(cyc_red_data -> fine_points_l);
       hypre_TFree(cyc_red_data -> A_l);
diff --git a/src/struct_ls/pfmg.c b/src/struct_ls/pfmg.c
index 99a4829..2c9d2fb 100644
--- a/src/struct_ls/pfmg.c
+++ b/src/struct_ls/pfmg.c
@@ -57,7 +57,7 @@ hypre_PFMGCreate( MPI_Comm  comm )
 HYPRE_Int
 hypre_PFMGDestroy( void *pfmg_vdata )
 {
-	hypre_PFMGData *pfmg_data = (hypre_PFMGData *)pfmg_vdata;
+   hypre_PFMGData *pfmg_data = (hypre_PFMGData *)pfmg_vdata;
 
    HYPRE_Int l;
    
@@ -73,6 +73,9 @@ hypre_PFMGDestroy( void *pfmg_vdata )
 
       if ((pfmg_data -> num_levels) > -1)
       {
+         HYPRE_Int constant_coefficient =
+            hypre_StructMatrixConstantCoefficient(pfmg_data -> A_l[0]);
+
          for (l = 0; l < (pfmg_data -> num_levels); l++)
          {
             if (pfmg_data -> active_l[l])
@@ -106,7 +109,11 @@ hypre_PFMGDestroy( void *pfmg_vdata )
             hypre_StructVectorDestroy(pfmg_data -> x_l[l+1]);
             hypre_StructVectorDestroy(pfmg_data -> tx_l[l+1]);
          }
-         hypre_SharedTFree(pfmg_data -> data);
+         if (constant_coefficient == 0)
+            { hypre_DeviceTFree(pfmg_data -> data); }
+         else
+            { hypre_UMTFree(pfmg_data -> data); }
+
          hypre_TFree(pfmg_data -> cdir_l);
          hypre_TFree(pfmg_data -> active_l);
          hypre_TFree(pfmg_data -> grid_l);
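
Because the allocation in hypre_PFMGSetup (later in this patch) now branches on the matrix's constant-coefficient mode, the destructor recomputes the same flag from A_l[0] and frees through the matching path; releasing device memory with the unified-memory free, or vice versa, would corrupt the allocator. A compact sketch of the symmetric select-by-flag pattern, with hypothetical allocators standing in for the hypre macros:

    #include <stdlib.h>

    static void *alloc_device(size_t n)  { return calloc(n, 1); }
    static void *alloc_unified(size_t n) { return calloc(n, 1); }
    static void  free_device(void *p)    { free(p); }
    static void  free_unified(void *p)   { free(p); }

    void *setup_data(size_t n, int constant_coefficient)
    {
       return (constant_coefficient == 0) ? alloc_device(n)
                                          : alloc_unified(n);
    }

    void destroy_data(void *data, int constant_coefficient)
    {
       /* Re-derive the flag exactly as setup did, then mirror it. */
       if (constant_coefficient == 0) { free_device(data); }
       else                           { free_unified(data); }
    }
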
diff --git a/src/struct_ls/pfmg2_setup_rap.c b/src/struct_ls/pfmg2_setup_rap.c
index ef9a7b7..eb30a50 100644
--- a/src/struct_ls/pfmg2_setup_rap.c
+++ b/src/struct_ls/pfmg2_setup_rap.c
@@ -311,11 +311,7 @@ hypre_PFMG2BuildRAPSym_onebox_FSS5_CC0(
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int             iA, iAm1, iAp1;
    HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                       
    HYPRE_Int             yOffsetA, yOffsetA_diag, yOffsetA_offd; 
    HYPRE_Int             xOffsetP; 
@@ -475,21 +471,20 @@ hypre_PFMG2BuildRAPSym_onebox_FSS5_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA;
-         iAp1 = iA + yOffsetA;
-
-         iP1 = iP - yOffsetP - xOffsetP;
+         HYPRE_Int iAm1 = iA - yOffsetA;
+         HYPRE_Int iAp1 = iA + yOffsetA;
+           
+         HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1];
-
+           
          iP1 = iP - yOffsetP;
          rap_cs[iAc] = rb[iR] * a_cc[iAm1] * pa[iP1]
             +          rb[iR] * a_cs[iAm1]
             +                   a_cs[iA]   * pa[iP1];
-
          iP1 = iP - yOffsetP + xOffsetP;
          rap_cse[iAc] = rb[iR] * a_ce[iAm1] * pa[iP1];
 
@@ -497,7 +492,7 @@ hypre_PFMG2BuildRAPSym_onebox_FSS5_CC0(
          rap_cw[iAc] =          a_cw[iA]
             +          rb[iR] * a_cw[iAm1] * pb[iP1]
             +          ra[iR] * a_cw[iAp1] * pa[iP1];
-
+           
          rap_cc[iAc] =          a_cc[iA]
             +          rb[iR] * a_cc[iAm1] * pb[iP]
             +          ra[iR] * a_cc[iAp1] * pa[iP]
@@ -529,14 +524,14 @@ hypre_PFMG2BuildRAPSym_onebox_FSS5_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA_diag;
-         iAp1 = iA + yOffsetA_diag;
+         HYPRE_Int iAm1 = iA - yOffsetA_diag;
+         HYPRE_Int iAp1 = iA + yOffsetA_diag;
 
-         iP1 = iP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] = rb[iR] * a_cw_offdm1 * pa[iP1];
 
          iP1 = iP - yOffsetP;
@@ -817,11 +812,7 @@ hypre_PFMG2BuildRAPSym_onebox_FSS9_CC0(
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int             iA, iAm1, iAp1;
    HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                       
    HYPRE_Int             yOffsetA, yOffsetA_diag, yOffsetA_offd; 
    HYPRE_Int             xOffsetP; 
@@ -1000,14 +991,14 @@ hypre_PFMG2BuildRAPSym_onebox_FSS9_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA;
-         iAp1 = iA + yOffsetA;
+         HYPRE_Int iAm1 = iA - yOffsetA;
+         HYPRE_Int iAp1 = iA + yOffsetA;
 
-         iP1 = iP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1]
             +           rb[iR] * a_csw[iAm1]
             +                    a_csw[iA]  * pa[iP1];
@@ -1064,21 +1055,20 @@ hypre_PFMG2BuildRAPSym_onebox_FSS9_CC0(
       a_cnw_offd = a_cnw[iA_offd];
       a_cnw_offdm1 = a_cnw[iA_offdm1];
 
-
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA_diag;
-         iAp1 = iA + yOffsetA_diag;
+         HYPRE_Int iAm1 = iA - yOffsetA_diag;
+         HYPRE_Int iAp1 = iA + yOffsetA_diag;
 
-         iP1 = iP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] = rb[iR] * a_cw_offdm1 * pa[iP1]
             +           rb[iR] * a_csw_offdm1
             +                    a_csw_offd  * pa[iP1];
@@ -1511,10 +1501,8 @@ hypre_PFMG2BuildRAPNoSym_onebox_FSS5_CC0(
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_cnw, *rap_cne;
 
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
+
    HYPRE_Int             yOffsetA, yOffsetA_diag, yOffsetA_offd;
    HYPRE_Int             xOffsetP;
    HYPRE_Int             yOffsetP;
@@ -1668,20 +1656,21 @@ hypre_PFMG2BuildRAPNoSym_onebox_FSS5_CC0(
    if ( constant_coefficient_A == 0 )
    {
       /*hypre_printf("nosym 5.0.0\n");*/
+
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA;
-         iAp1 = iA + yOffsetA;
+         HYPRE_Int iAm1 = iA - yOffsetA;
+         HYPRE_Int iAp1 = iA + yOffsetA;
 
-         iP1 = iP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
          rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1];
 
          iP1 = iP + yOffsetP;
@@ -1713,20 +1702,20 @@ hypre_PFMG2BuildRAPNoSym_onebox_FSS5_CC0(
       a_ce_offd = a_ce[iA_offd];
       a_ce_offdm1 = a_ce[iA_offdm1];
       a_ce_offdp1 = a_ce[iA_offdp1];
-
+ 
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAp1 = iA + yOffsetA_diag;
+         HYPRE_Int iAp1 = iA + yOffsetA_diag;
 
-         iP1 = iP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
          rap_cne[iAc] = ra[iR] * a_ce_offdp1 * pb[iP1];
 
          iP1 = iP + yOffsetP;
@@ -1981,10 +1970,7 @@ hypre_PFMG2BuildRAPNoSym_onebox_FSS9_CC0(
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_cnw, *rap_cne;
 
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
    HYPRE_Int             yOffsetA, yOffsetA_diag, yOffsetA_offd;
    HYPRE_Int             xOffsetP;
    HYPRE_Int             yOffsetP;
@@ -2164,20 +2150,21 @@ hypre_PFMG2BuildRAPNoSym_onebox_FSS9_CC0(
    if ( constant_coefficient_A==0 )
    {
       /*hypre_printf("nosym 9.0.0\n");*/
+
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA;
-         iAp1 = iA + yOffsetA;
+         HYPRE_Int iAm1 = iA - yOffsetA;
+         HYPRE_Int iAp1 = iA + yOffsetA;
 
-         iP1 = iP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
          rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1]
             +           ra[iR] * a_cne[iAp1]
             +                    a_cne[iA]  * pb[iP1];
@@ -2231,14 +2218,13 @@ hypre_PFMG2BuildRAPNoSym_onebox_FSS9_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - yOffsetA_diag;
-         iAp1 = iA + yOffsetA_diag;
+         HYPRE_Int iAp1 = iA + yOffsetA_diag;
 
-         iP1 = iP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
          rap_cne[iAc] = ra[iR] * a_ce_offdp1 * pb[iP1]
             +           ra[iR] * a_cne_offdp1
             +                    a_cne_offd  * pb[iP1];
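
In the constant_coefficient_A != 0 branches above, stencil entries that do not vary over the box (a_ce_offd, a_ce_offdm1, a_ce_offdp1, ...) are read once into scalars before the box loop, so the loop body indexes only the genuinely varying arrays; this is also why iAm1 can disappear from some of these loops entirely. Reduced sketch of the hoist:

    /* When a coefficient is constant over the box, load it once outside
     * the loop instead of re-indexing the array every iteration. */
    void apply_row(double *out, const double *a_ce, const double *pb,
                   long n, long iA_offdp1)
    {
       double a_ce_offdp1 = a_ce[iA_offdp1];  /* loop-invariant load */
       long i;
       for (i = 0; i < n; i++)
       {
          out[i] = a_ce_offdp1 * pb[i];
       }
    }
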
diff --git a/src/struct_ls/pfmg3_setup_rap.c b/src/struct_ls/pfmg3_setup_rap.c
index 8b8bf0a..3355b30 100644
--- a/src/struct_ls/pfmg3_setup_rap.c
+++ b/src/struct_ls/pfmg3_setup_rap.c
@@ -359,10 +359,7 @@ hypre_PFMG3BuildRAPSym_onebox_FSS07_CC0(
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_bc, *rap_bw, *rap_be, *rap_bs, *rap_bn;
    HYPRE_Real           *rap_csw, *rap_cse;
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
                         
    HYPRE_Int             zOffsetA; 
    HYPRE_Int             zOffsetA_diag; 
@@ -565,14 +562,14 @@ hypre_PFMG3BuildRAPSym_onebox_FSS07_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
 
-         iP1 = iP - zOffsetP - yOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP;
          rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1];
 
          iP1 = iP - zOffsetP - xOffsetP;
@@ -632,21 +629,21 @@ hypre_PFMG3BuildRAPSym_onebox_FSS07_CC0(
       a_bc_offdp1 = a_bc[iA_offdp1];
       a_ac_offd = a_ac[iA_offd];
       a_ac_offdm1 = a_ac[iA_offdm1];
-
+       
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP - zOffsetP - yOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP;
          rap_bs[iAc] = rb[iR] * a_cs_offdm1 * pa[iP1];
 
          iP1 = iP - zOffsetP - xOffsetP;
@@ -656,7 +653,7 @@ hypre_PFMG3BuildRAPSym_onebox_FSS07_CC0(
          rap_bc[iAc] =          a_bc_offd   * pa[iP1]
             +          rb[iR] * a_cc[iAm1] * pa[iP1]
             +          rb[iR] * a_bc_offdm1;
- 
+
          iP1 = iP - zOffsetP + xOffsetP;
          rap_be[iAc] = rb[iR] * a_ce_offdm1 * pa[iP1];
  
@@ -1015,11 +1012,8 @@ hypre_PFMG3BuildRAPSym_onebox_FSS19_CC0(
    HYPRE_Real           *rap_csw, *rap_cse;
    HYPRE_Real           *rap_bsw, *rap_bse, *rap_bnw, *rap_bne;
 
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
-                        
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
+            
    HYPRE_Int             zOffsetA; 
    HYPRE_Int             zOffsetA_diag; 
    HYPRE_Int             zOffsetA_offd; 
@@ -1305,14 +1299,14 @@ hypre_PFMG3BuildRAPSym_onebox_FSS19_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
 
-         iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
          rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1];
 
          iP1 = iP - zOffsetP - yOffsetP;
@@ -1327,7 +1321,7 @@ hypre_PFMG3BuildRAPSym_onebox_FSS19_CC0(
          rap_bw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1]
             +          rb[iR] * a_bw[iAm1]
             +                   a_bw[iA]   * pa[iP1];
- 
+
          iP1 = iP - zOffsetP; 
          rap_bc[iAc] =          a_bc[iA] * pa[iP1]
             +          rb[iR] * a_cc[iAm1] * pa[iP1]
@@ -1348,7 +1342,7 @@ hypre_PFMG3BuildRAPSym_onebox_FSS19_CC0(
  
          iP1 = iP - zOffsetP + yOffsetP + xOffsetP;
          rap_bne[iAc] = rb[iR] * a_cne[iAm1] * pa[iP1];
-
+         
          iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] =         a_csw[iA]
             +          rb[iR] * a_csw[iAm1] * pb[iP1]
@@ -1435,14 +1429,14 @@ hypre_PFMG3BuildRAPSym_onebox_FSS19_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
          rap_bsw[iAc] = rb[iR] * a_csw_offdm1 * pa[iP1];
 
          iP1 = iP - zOffsetP - yOffsetP;
@@ -1457,12 +1451,12 @@ hypre_PFMG3BuildRAPSym_onebox_FSS19_CC0(
          rap_bw[iAc] = rb[iR] * a_cw_offdm1 * pa[iP1]
             +          rb[iR] * a_bw_offdm1
             +                   a_bw_offd   * pa[iP1];
- 
+
          iP1 = iP - zOffsetP; 
          rap_bc[iAc] =          a_bc_offd * pa[iP1]
             +          rb[iR] * a_cc[iAm1] * pa[iP1]
             +          rb[iR] * a_bc_offdm1;
- 
+
          iP1 = iP - zOffsetP + xOffsetP;
          rap_be[iAc] = rb[iR] * a_ce_offdm1 * pa[iP1]
             +          rb[iR] * a_be_offdm1
@@ -1483,7 +1477,7 @@ hypre_PFMG3BuildRAPSym_onebox_FSS19_CC0(
          rap_csw[iAc] =         a_csw_offd
             +          rb[iR] * a_csw_offdm1 * pb[iP1]
             +          ra[iR] * a_csw_offdp1 * pa[iP1];
-
+         
          iP1 = iP - yOffsetP;
          rap_cs[iAc] =          a_cs_offd
             +          rb[iR] * a_cs_offdm1 * pb[iP1]
@@ -1975,10 +1969,8 @@ hypre_PFMG3BuildRAPSym_onebox_FSS27_CC0(
    HYPRE_Real           *rap_csw, *rap_cse;
    HYPRE_Real           *rap_bsw, *rap_bse, *rap_bnw, *rap_bne;
 
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
+
                         
    HYPRE_Int             zOffsetA; 
    HYPRE_Int             zOffsetA_diag; 
@@ -2302,14 +2294,14 @@ hypre_PFMG3BuildRAPSym_onebox_FSS27_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
 
-         iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
          rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1]
             +           rb[iR] * a_bsw[iAm1]
             +                    a_bsw[iA]   * pa[iP1];
@@ -2333,7 +2325,7 @@ hypre_PFMG3BuildRAPSym_onebox_FSS27_CC0(
          rap_bc[iAc] =          a_bc[iA]   * pa[iP1]
             +          rb[iR] * a_cc[iAm1] * pa[iP1]
             +          rb[iR] * a_bc[iAm1];
- 
+
          iP1 = iP - zOffsetP + xOffsetP;
          rap_be[iAc] = rb[iR] * a_ce[iAm1] * pa[iP1]
             +          rb[iR] * a_be[iAm1]
@@ -2362,7 +2354,7 @@ hypre_PFMG3BuildRAPSym_onebox_FSS27_CC0(
             +                   a_asw[iA]   * pa[iP1]
             +          rb[iR] * a_asw[iAm1]
             +          ra[iR] * a_bsw[iAp1];
-
+         
          iP1 = iP - yOffsetP;
          rap_cs[iAc] =          a_cs[iA]
             +          rb[iR] * a_cs[iAm1] * pb[iP1]
@@ -2461,14 +2453,14 @@ hypre_PFMG3BuildRAPSym_onebox_FSS27_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+         HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
          rap_bsw[iAc] = rb[iR] * a_csw_offdm1 * pa[iP1]
             +           rb[iR] * a_bsw_offdm1
             +                    a_bsw_offd   * pa[iP1];
@@ -2492,7 +2484,7 @@ hypre_PFMG3BuildRAPSym_onebox_FSS27_CC0(
          rap_bc[iAc] =          a_bc_offd   * pa[iP1]
             +          rb[iR] * a_cc[iAm1] * pa[iP1]
             +          rb[iR] * a_bc_offdm1;
- 
+         
          iP1 = iP - zOffsetP + xOffsetP;
          rap_be[iAc] = rb[iR] * a_ce_offdm1 * pa[iP1]
             +          rb[iR] * a_be_offdm1
@@ -2512,7 +2504,7 @@ hypre_PFMG3BuildRAPSym_onebox_FSS27_CC0(
          rap_bne[iAc] = rb[iR] * a_cne_offdm1 * pa[iP1]
             +           rb[iR] * a_bne_offdm1
             +                    a_bne_offd   * pa[iP1];
-
+         
          iP1 = iP - yOffsetP - xOffsetP;
          rap_csw[iAc] =          a_csw_offd
             +          rb[iR] * a_csw_offdm1 * pb[iP1]
@@ -2530,7 +2522,7 @@ hypre_PFMG3BuildRAPSym_onebox_FSS27_CC0(
             +                   a_as_offd   * pa[iP1]
             +          rb[iR] * a_as_offdm1
             +          ra[iR] * a_bs_offdp1;
- 
+         
          iP1 = iP - yOffsetP + xOffsetP;
          rap_cse[iAc] =          a_cse_offd
             +          rb[iR] * a_cse_offdm1 * pb[iP1]
@@ -2539,7 +2531,7 @@ hypre_PFMG3BuildRAPSym_onebox_FSS27_CC0(
             +                   a_ase_offd   * pa[iP1]
             +          rb[iR] * a_ase_offdm1
             +          ra[iR] * a_bse_offdp1;
-
+                 
          iP1 = iP - xOffsetP;
          rap_cw[iAc] =          a_cw_offd
             +          rb[iR] * a_cw_offdm1 * pb[iP1]
@@ -3179,10 +3171,8 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS07_CC0(
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_ac, *rap_aw, *rap_ae, *rap_as, *rap_an;
    HYPRE_Real           *rap_cnw, *rap_cne;
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
+
    HYPRE_Int             zOffsetA;
    HYPRE_Int             zOffsetA_diag; 
    HYPRE_Int             zOffsetA_offd; 
@@ -3382,42 +3372,42 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS07_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
-
-         iP1 = iP + zOffsetP + yOffsetP;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
+           
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP;
          rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1];
 
          iP1 = iP + zOffsetP + xOffsetP;
          rap_ae[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1];
- 
+           
          iP1 = iP + zOffsetP; 
          rap_ac[iAc] =          a_ac[iA]   * pb[iP1]
             +          ra[iR] * a_cc[iAp1] * pb[iP1]
             +          ra[iR] * a_ac[iAp1];
- 
+           
          iP1 = iP + zOffsetP - xOffsetP;
          rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1];
- 
+           
          iP1 = iP + zOffsetP - yOffsetP;
          rap_as[iAc] = ra[iR] * a_cs[iAp1] * pb[iP1];
- 
+           
          iP1 = iP + yOffsetP;
          rap_cn[iAc] =          a_cn[iA]
             +          rb[iR] * a_cn[iAm1] * pb[iP1]
             +          ra[iR] * a_cn[iAp1] * pa[iP1];
- 
+           
          iP1 = iP + xOffsetP;
          rap_ce[iAc] =          a_ce[iA]
             +          rb[iR] * a_ce[iAm1] * pb[iP1]
             +          ra[iR] * a_ce[iAp1] * pa[iP1];
-
+           
          rap_cnw[iAc] = 0.0;
- 
+           
          rap_cne[iAc] = 0.0;
       }
       hypre_BoxLoop4End(iP, iR, iA, iAc);
@@ -3444,14 +3434,14 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS07_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         //HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP + zOffsetP + yOffsetP;
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP;
          rap_an[iAc] = ra[iR] * a_cn_offdp1 * pb[iP1];
 
          iP1 = iP + zOffsetP + xOffsetP;
@@ -3785,10 +3775,7 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS19_CC0(
    HYPRE_Real           *rap_ac, *rap_aw, *rap_ae, *rap_as, *rap_an;
    HYPRE_Real           *rap_cnw, *rap_cne;
    HYPRE_Real           *rap_asw, *rap_ase, *rap_anw, *rap_ane;
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
    HYPRE_Int             zOffsetA;
    HYPRE_Int             zOffsetA_diag; 
    HYPRE_Int             zOffsetA_offd; 
@@ -4068,20 +4055,21 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS19_CC0(
 
    if ( constant_coefficient_A == 0 )
    {
+
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
 
-         iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
          rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1];
 
          iP1 = iP + zOffsetP + yOffsetP;
@@ -4106,7 +4094,7 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS19_CC0(
          rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1]
             +          ra[iR] * a_aw[iAp1]
             +                   a_aw[iA]   * pb[iP1];
- 
+
          iP1 = iP + zOffsetP - yOffsetP + xOffsetP;
          rap_ase[iAc] = ra[iR] * a_cse[iAp1] * pb[iP1];
 
@@ -4193,14 +4181,14 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS19_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         //HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
          rap_ane[iAc] = ra[iR] * a_cne_offdp1 * pb[iP1];
 
          iP1 = iP + zOffsetP + yOffsetP;
@@ -4233,7 +4221,7 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS19_CC0(
          rap_as[iAc] = ra[iR] * a_cs_offdp1 * pb[iP1]
             +          ra[iR] * a_as_offdp1
             +                   a_as_offd   * pb[iP1];
- 
+
          iP1 = iP + zOffsetP - yOffsetP - xOffsetP;
          rap_asw[iAc] = ra[iR] * a_csw_offdp1 * pb[iP1];
 
@@ -4250,7 +4238,7 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS19_CC0(
             +                   a_an_offd   * pa[iP1]
             +          rb[iR] * a_an_offdm1
             +          ra[iR] * a_bn_offdp1;
- 
+
          iP1 = iP + yOffsetP - xOffsetP;
          rap_cnw[iAc] =         a_cnw_offd
             +          rb[iR] * a_cnw_offdm1 * pb[iP1]
@@ -4700,10 +4688,7 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS27_CC0(
    HYPRE_Real           *rap_cnw, *rap_cne;
    HYPRE_Real           *rap_asw, *rap_ase, *rap_anw, *rap_ane;
 
-   HYPRE_Int             iA, iAm1, iAp1, iA_offd, iA_offdm1, iA_offdp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
+   HYPRE_Int             iA_offd, iA_offdm1, iA_offdp1;
                  
    HYPRE_Int             zOffsetA;
    HYPRE_Int             zOffsetA_diag; 
@@ -5021,20 +5006,21 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS27_CC0(
 
    if ( constant_coefficient_A == 0 )
    {
+
       hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                           P_dbox, cstart, stridec, iP,
                           R_dbox, cstart, stridec, iR,
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA;
-         iAp1 = iA + zOffsetA;
+         HYPRE_Int iAm1 = iA - zOffsetA;
+         HYPRE_Int iAp1 = iA + zOffsetA;
 
-         iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
          rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1]
             +           ra[iR] * a_ane[iAp1]
             +                    a_ane[iA]   * pb[iP1];
@@ -5063,7 +5049,7 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS27_CC0(
          rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1]
             +          ra[iR] * a_aw[iAp1]
             +                   a_aw[iA]   * pb[iP1];
- 
+
          iP1 = iP + zOffsetP - yOffsetP + xOffsetP;
          rap_ase[iAc] = ra[iR] * a_cse[iAp1] * pb[iP1]
             +           ra[iR] * a_ase[iAp1]
@@ -5088,7 +5074,7 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS27_CC0(
             +                   a_ane[iA]   * pa[iP1]
             +          rb[iR] * a_ane[iAm1]
             +          ra[iR] * a_bne[iAp1];
-
+         
          iP1 = iP + yOffsetP;
          rap_cn[iAc] =          a_cn[iA]
             +          rb[iR] * a_cn[iAm1] * pb[iP1]
@@ -5177,14 +5163,14 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS27_CC0(
                           A_dbox, fstart, stridef, iA,
                           RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(iP, iR, iA, iAc)
       {
-         iAm1 = iA - zOffsetA_diag;
-         iAp1 = iA + zOffsetA_diag;
+         //HYPRE_Int iAm1 = iA - zOffsetA_diag;
+         HYPRE_Int iAp1 = iA + zOffsetA_diag;
 
-         iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+         HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
          rap_ane[iAc] = ra[iR] * a_cne_offdp1 * pb[iP1]
             +           ra[iR] * a_ane_offdp1
             +                    a_ane_offd   * pb[iP1];
@@ -5208,12 +5194,12 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS27_CC0(
          rap_ac[iAc] =          a_ac_offd   * pb[iP1]
             +          ra[iR] * a_cc[iAp1] * pb[iP1]
             +          ra[iR] * a_ac_offdp1;
- 
+
          iP1 = iP + zOffsetP - xOffsetP;
          rap_aw[iAc] = ra[iR] * a_cw_offdp1 * pb[iP1]
             +          ra[iR] * a_aw_offdp1
             +                   a_aw_offd   * pb[iP1];
- 
+
          iP1 = iP + zOffsetP - yOffsetP + xOffsetP;
          rap_ase[iAc] = ra[iR] * a_cse_offdp1 * pb[iP1]
             +           ra[iR] * a_ase_offdp1
@@ -5229,7 +5215,6 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS27_CC0(
             +           ra[iR] * a_asw_offdp1
             +                    a_asw_offd   * pb[iP1];
 
-
          iP1 = iP + yOffsetP + xOffsetP;
          rap_cne[iAc] =         a_cne_offd
             +          rb[iR] * a_cne_offdm1 * pb[iP1]
@@ -5247,7 +5232,7 @@ hypre_PFMG3BuildRAPNoSym_onebox_FSS27_CC0(
             +                   a_an_offd   * pa[iP1]
             +          rb[iR] * a_an_offdm1
             +          ra[iR] * a_bn_offdp1;
- 
+
          iP1 = iP + yOffsetP - xOffsetP;
          rap_cnw[iAc] =         a_cnw_offd
             +          rb[iR] * a_cnw_offdm1 * pb[iP1]
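
All of these kernels run inside hypre's BoxLoop macros, which hand the body one linearized index per data box (iP, iR, iA, iAc) because each array can carry its own strides. A hypothetical, much simplified expansion of a two-index box loop, only to make the pragmas and index arithmetic above concrete (not hypre's real macro expansion):

    /* One logical 2-D traversal with a separate running offset per
     * array; the body sees only the linear indices. */
    void box_loop2(long nx, long ny,
                   long ystride0, long ystride1,
                   double *a0, const double *a1)
    {
       long j, i;
       for (j = 0; j < ny; j++)
       {
          long i0 = j * ystride0;
          long i1 = j * ystride1;
          for (i = 0; i < nx; i++)
          {
             a0[i0 + i] += a1[i1 + i];
          }
       }
    }
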
diff --git a/src/struct_ls/pfmg_setup.c b/src/struct_ls/pfmg_setup.c
index b032c0c..a652abe 100644
--- a/src/struct_ls/pfmg_setup.c
+++ b/src/struct_ls/pfmg_setup.c
@@ -17,19 +17,19 @@
 
 #define hypre_PFMGSetCIndex(cdir, cindex)       \
    {                                            \
-      hypre_SetIndex3(cindex, 0, 0, 0);          \
+      hypre_SetIndex3(cindex, 0, 0, 0);         \
       hypre_IndexD(cindex, cdir) = 0;           \
    }
 
 #define hypre_PFMGSetFIndex(cdir, findex)       \
    {                                            \
-      hypre_SetIndex3(findex, 0, 0, 0);          \
+      hypre_SetIndex3(findex, 0, 0, 0);         \
       hypre_IndexD(findex, cdir) = 1;           \
    }
 
 #define hypre_PFMGSetStride(cdir, stride)       \
    {                                            \
-      hypre_SetIndex3(stride, 1, 1, 1);          \
+      hypre_SetIndex3(stride, 1, 1, 1);         \
       hypre_IndexD(stride, cdir) = 2;           \
    }
 
@@ -42,7 +42,7 @@ hypre_PFMGSetup( void               *pfmg_vdata,
                  hypre_StructVector *b,
                  hypre_StructVector *x        )
 {
-	hypre_PFMGData       *pfmg_data = (hypre_PFMGData       *)pfmg_vdata;
+   hypre_PFMGData       *pfmg_data = (hypre_PFMGData       *)pfmg_vdata;
 
    MPI_Comm              comm = (pfmg_data -> comm);
                      
@@ -104,6 +104,8 @@ hypre_PFMGSetup( void               *pfmg_vdata,
    HYPRE_Int             b_num_ghost[]  = {0, 0, 0, 0, 0, 0};
    HYPRE_Int             x_num_ghost[]  = {1, 1, 1, 1, 1, 1};
 
+   HYPRE_Int              constant_coefficient;
+   
 #if DEBUG
    char                  filename[255];
 #endif
@@ -116,7 +118,8 @@ hypre_PFMGSetup( void               *pfmg_vdata,
 
    grid  = hypre_StructMatrixGrid(A);
    ndim  = hypre_StructGridNDim(grid);
-
+   constant_coefficient = hypre_StructMatrixConstantCoefficient(A);
+   
    /* Compute a new max_levels value based on the grid */
    cbox = hypre_BoxDuplicate(hypre_StructGridBoundingBox(grid));
    max_levels = 1;
@@ -377,7 +380,12 @@ hypre_PFMGSetup( void               *pfmg_vdata,
       hypre_StructVectorInitializeShell(tx_l[l+1]);
    }
 
-   data = hypre_SharedCTAlloc(HYPRE_Real, data_size);
+   //data = hypre_DeviceCTAlloc(HYPRE_Real,data_size);
+   if (constant_coefficient == 0)
+      data = hypre_DeviceCTAlloc(HYPRE_Real,data_size);
+   else
+      data = hypre_UMCTAlloc(HYPRE_Real,data_size);
+   
    (pfmg_data -> data) = data;
 
    hypre_StructVectorInitializeData(tx_l[0], data);
@@ -730,6 +738,244 @@ hypre_PFMGComputeDxyz( hypre_StructMatrix *A,
       /* constant_coefficient==0, all coefficients vary with space */
       else
       {
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA)
+         /*FIXME: need reduction for more variables*/
+         HYPRE_Int tmp = 0;
+         hypre_MatrixIndexMove(A, stencil_size, i, tmp, 3);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcx
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:cx)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,cx);
+         {
+            HYPRE_Real tcx = 0.0;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            //Ap = (data_A + indices_d[sdiag]);
+            
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+               /* x-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 0,stencil_shape_d[si]);
+               if (Astenc)
+               {
+                  tcx -= Ap[Ai]*diag;
+               }
+            }
+
+            cx += tcx;
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,cx);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcx
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:sqcx)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,sqcx);
+         {
+            HYPRE_Real tcx = 0.0;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+
+               /* x-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 0,stencil_shape_d[si]);
+               if (Astenc)
+               {
+                  tcx -= Ap[Ai]*diag;
+               }
+            }       
+            sqcx += (tcx*tcx);
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,sqcx);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcy
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:cy)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,cy);
+         {
+            HYPRE_Real tcy = 0.0;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+              
+               /* y-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 1,stencil_shape_d[stencil_size+si]);
+               if (Astenc)
+               {
+                  tcy -= Ap[Ai]*diag;
+               }
+            }
+
+            cy += tcy;            
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,cy);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcy
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:sqcy)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,sqcy);
+         {
+            HYPRE_Real tcy = 0.0;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+              
+               /* y-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 1,stencil_shape_d[stencil_size+si]);
+               if (Astenc)
+               {
+                  tcy -= Ap[Ai]*diag;
+               }
+            }       
+            sqcy += (tcy*tcy);
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,sqcy);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcz
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:cz)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,cz);
+         {
+            HYPRE_Real tcz = 0.0;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+              
+               /* z-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 2,stencil_shape_d[2*stencil_size+si]);
+               if (Astenc)
+               {
+                  tcz -= Ap[Ai]*diag;
+               }
+            }
+
+            cz += tcz;            
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,cz);
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR Ai,si,Ap,diag,Astenc,tcz
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:sqcz)
+         hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size,
+                                         A_dbox, start, stride, Ai,sqcz);
+         {
+            HYPRE_Real tcz = 0.0;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
+            /* get sign of diagonal */
+            Ap = hypre_StructGetMatrixBoxData(A, i, sdiag);
+            diag = 1.0;
+            if (Ap[Ai] < 0)
+            {
+               diag = -1.0;
+            }
+
+            for (si = 0; si < stencil_size; si++)
+            {
+               Ap = hypre_StructGetMatrixBoxData(A, i, si);
+
+               /* z-direction */
+               Astenc = hypre_StructGetIndexD(stencil_shape[si], 2,stencil_shape_d[2*stencil_size+si]);
+               if (Astenc)
+               {
+                  tcz -= Ap[Ai]*diag;
+               }
+            }       
+            sqcz += (tcz*tcz);
+         }
+         hypre_newBoxLoop1ReductionEnd(Ai,sqcz);
+         hypre_StructCleanIndexD();
+#else
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size,
                              A_dbox, start, stride, Ai);
 #ifdef HYPRE_USING_OPENMP
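
The guarded block above trades one fused loop for six single-variable passes (cx, sqcx, cy, sqcy, cz, sqcz); the /*FIXME: need reduction for more variables*/ comment records that hypre_newBoxLoop1ReductionBegin carries exactly one reduction variable per loop. Plain OpenMP can fuse the sum and sum of squares back into a single pass, which is roughly what the #else branch below does:

    /* Sum and sum of squares in one pass with a two-variable
     * reduction clause. */
    void column_moments(const double *a, long n, double *sum, double *sumsq)
    {
       double cx = 0.0, sqcx = 0.0;
       long i;
    #ifdef _OPENMP
    #pragma omp parallel for reduction(+:cx,sqcx)
    #endif
       for (i = 0; i < n; i++)
       {
          double tcx = -a[i];   /* per-point contribution */
          cx   += tcx;
          sqcx += tcx * tcx;
       }
       *sum = cx;
       *sumsq = sqcx;
    }
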
@@ -737,6 +983,11 @@ hypre_PFMGComputeDxyz( hypre_StructMatrix *A,
 #endif
          hypre_BoxLoop1For(Ai)
          {
+            HYPRE_Real tcx, tcy, tcz;
+            HYPRE_Real *Ap;
+            HYPRE_Int Astenc,si;
+            HYPRE_Real diag;
+            
             tcx = 0.0;
             tcy = 0.0;
             tcz = 0.0;
@@ -784,9 +1035,10 @@ hypre_PFMGComputeDxyz( hypre_StructMatrix *A,
             sqcz += (tcz*tcz);
          }
          hypre_BoxLoop1End(Ai);
+#endif
       }
    }
-
+   
    cxyz[0] = cx;
    cxyz[1] = cy;
    cxyz[2] = cz;
@@ -850,7 +1102,7 @@ hypre_PFMGComputeDxyz( hypre_StructMatrix *A,
       }
       else
       {
-         dxyz[d] = 1.0e+123;
+         dxyz[d] = HYPRE_REAL_MAX/1000;
       }
    }
 
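
The fallback spacing changes from the literal 1.0e+123 to HYPRE_REAL_MAX/1000, which stays finite when HYPRE_Real is single precision (1.0e+123 overflows a float to infinity). The same idea in portable C, with FLT_MAX standing in for a single-precision HYPRE_REAL_MAX:

    #include <float.h>

    /* A large-but-finite sentinel scaled from the type's maximum, so
     * later arithmetic on it does not immediately overflow. */
    float spacing_sentinel(void)
    {
       return FLT_MAX / 1000.0f;
    }
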
@@ -910,6 +1162,11 @@ hypre_ZeroDiagonal( hypre_StructMatrix *A )
       }
       else
       {
+         /*FIXME: need reduction for multiplication*/
+#if defined(HYPRE_USE_CUDA) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS)
+         hypre_newBoxLoop1ReductionMult(hypre_StructMatrixNDim(A), loop_size,
+                                        A_dbox, start, stride, Ai, Ap, diag_product);
+#else
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(A), loop_size,
                              A_dbox, start, stride, Ai);
 #ifdef HYPRE_USING_OPENMP
@@ -920,6 +1177,7 @@ hypre_ZeroDiagonal( hypre_StructMatrix *A )
             diag_product *= Ap[Ai];
          }
          hypre_BoxLoop1End(Ai);
+#endif
       }
    }
 
diff --git a/src/struct_ls/pfmg_setup_interp.c b/src/struct_ls/pfmg_setup_interp.c
index 54f4b1b..6bd003b 100644
--- a/src/struct_ls/pfmg_setup_interp.c
+++ b/src/struct_ls/pfmg_setup_interp.c
@@ -239,25 +239,25 @@ hypre_PFMGSetupInterpOp_CC0
   HYPRE_Int           si0,
   HYPRE_Int           si1 )
 {
-   HYPRE_Int              si;
-   HYPRE_Int              Ai, Pi;
-   HYPRE_Real            *Ap;
-   HYPRE_Real             center;
-   HYPRE_Int              Astenc;
-   HYPRE_Int              mrk0, mrk1;
    hypre_StructStencil   *stencil = hypre_StructMatrixStencil(A);
    hypre_Index           *stencil_shape = hypre_StructStencilShape(stencil);
    HYPRE_Int              stencil_size = hypre_StructStencilSize(stencil);
    HYPRE_Int              warning_cnt= 0;
 
+   hypre_MatrixIndexMove(A, stencil_size, i, cdir,1);
+   
    hypre_BoxLoop2Begin(hypre_StructMatrixNDim(A), loop_size,
                        A_dbox, start, stride, Ai,
                        P_dbox, startc, stridec, Pi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,Pi,si,center,Ap,Astenc,mrk0,mrk1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
    hypre_BoxLoop2For(Ai, Pi)
    {
+      HYPRE_Int si,mrk0,mrk1,Astenc;
+      HYPRE_Real center;
+      HYPRE_Real *Ap;
+           
       center  = 0.0;
       Pp0[Pi] = 0.0;
       Pp1[Pi] = 0.0;
@@ -266,9 +266,10 @@ hypre_PFMGSetupInterpOp_CC0
 
       for (si = 0; si < stencil_size; si++)
       {
-         Ap = hypre_StructMatrixBoxData(A, i, si);
-         Astenc = hypre_IndexD(stencil_shape[si], cdir);
-
+         Ap = hypre_StructGetMatrixBoxData(A, i, si);
+        
+         Astenc = hypre_StructGetIndexD(stencil_shape[si], cdir,stencil_shape_d[si]);
+        
          if (Astenc == 0)
          {
             center += Ap[Ai];
@@ -281,7 +282,7 @@ hypre_PFMGSetupInterpOp_CC0
          {
             Pp1[Pi] -= Ap[Ai];
          }
-
+         
          if (si == si0 && Ap[Ai] == 0.0)
             mrk0++;
          if (si == si1 && Ap[Ai] == 0.0)
@@ -290,7 +291,7 @@ hypre_PFMGSetupInterpOp_CC0
 
       if (!center)
       {
-         warning_cnt++;
+         //warning_cnt++;
          Pp0[Pi] = 0.0;
          Pp1[Pi] = 0.0;  
       }
@@ -449,9 +450,9 @@ hypre_PFMGSetupInterpOp_CC2
    HYPRE_Int              Pi;
    HYPRE_Real            *Ap;
    HYPRE_Real             P0, P1;
-   HYPRE_Real             center, center_offd;
+   HYPRE_Real             center_offd;
    HYPRE_Int              Astenc;
-   HYPRE_Int              mrk0, mrk1, mrk0_offd, mrk1_offd;
+   HYPRE_Int              mrk0_offd, mrk1_offd;
    hypre_StructStencil   *stencil = hypre_StructMatrixStencil(A);
    hypre_Index           *stencil_shape = hypre_StructStencilShape(stencil);
    HYPRE_Int              stencil_size = hypre_StructStencilSize(stencil);
@@ -512,23 +513,34 @@ hypre_PFMGSetupInterpOp_CC2
       }
 
       si = diag_rank;
+      
+      hypre_MatrixIndexMove(A, stencil_size, i, si, 1);
       hypre_BoxLoop2Begin(hypre_StructMatrixNDim(A), loop_size,
                           A_dbox, start, stride, Ai,
                           P_dbox, startc, stridec, Pi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,Pi,center,Ap,Astenc,mrk0,mrk1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(Ai, Pi)
       {
-         Pp0[Pi] = P0;
-         Pp1[Pi] = P1;
+         HYPRE_Int   mrk0,mrk1;
+         HYPRE_Real  center;
+         HYPRE_Real *Ap;
+         HYPRE_Real p0val,p1val;
+         
+         //Pp0[Pi] = P0;
+         //Pp1[Pi] = P1;
+         p0val = P0;
+         p1val = P1;
+         
          center = center_offd;
          mrk0 = mrk0_offd;
          mrk1 = mrk1_offd;
+         
+         Ap = hypre_StructGetMatrixBoxData(A, i, si);
+         //Astenc = hypre_IndexD(stencil_shape[si], cdir);
+         //hypre_assert( Astenc==0 );
 
-         Ap = hypre_StructMatrixBoxData(A, i, si);
-         Astenc = hypre_IndexD(stencil_shape[si], cdir);
-         hypre_assert( Astenc==0 );
          center += Ap[Ai];
 
          if (si == si0 && Ap[Ai] == 0.0)
@@ -538,14 +550,18 @@ hypre_PFMGSetupInterpOp_CC2
 
          if (!center)
          {
-            warning_cnt++;
-            Pp0[Pi] = 0.0;
-            Pp1[Pi] = 0.0;  
+            //warning_cnt++;
+            //Pp0[Pi] = 0.0;
+            //Pp1[Pi] = 0.0;
+            p0val = 0;
+            p1val = 0;
          }
          else
          {
-            Pp0[Pi] /= center;
-            Pp1[Pi] /= center;  
+            //Pp0[Pi] /= center;
+            //Pp1[Pi] /= center;
+            p0val /= center;
+            p1val /= center;
          }
 
          /*----------------------------------------------
@@ -554,13 +570,17 @@ hypre_PFMGSetupInterpOp_CC2
           * interpolation and operator stencils reaching
           * outside domain.
           *----------------------------------------------*/
+
-         if (mrk0 != 0)
-            Pp0[Pi] = 0.0;
-         if (mrk1 != 0)
-            Pp1[Pi] = 0.0;
+         if (mrk0 != 0)
+            p0val = 0.0;
+         if (mrk1 != 0)
+            p1val = 0.0;
+         Pp0[Pi] = p0val;
+         Pp1[Pi] = p1val;
 
       }
       hypre_BoxLoop2End(Ai, Pi);
+      //hypre_StructCleanIndexD();
    }
 
    if (warning_cnt)
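
[Editorial sketch] The pattern above recurs throughout this release: scalars that are live only inside a box loop (si, center, Ap, Astenc, mrk0, mrk1, ...) move from function scope into the loop body, so the OpenMP pragma shrinks from a long private(...) list to private(HYPRE_BOX_PRIVATE). Block-scope variables are implicitly private to each thread, and the same loop bodies can then be captured by value by the RAJA/Kokkos/CUDA lambda backends this version introduces. A minimal standalone sketch of the idea (plain C/OpenMP with illustrative names, not hypre's BoxLoop macros):

   /* Sketch: variables declared inside the parallel loop body are
    * thread-private automatically, so the pragma needs no long
    * private(...) list; compiles with or without OpenMP. */
   void scale_rows(int n, int m, const double *a, double *p)
   {
      int i;
   #ifdef _OPENMP
   #pragma omp parallel for
   #endif
      for (i = 0; i < n; i++)
      {
         double center = 0.0;   /* block scope => private per thread */
         int    j;
         for (j = 0; j < m; j++)
         {
            center += a[i * m + j];
         }
         p[i] = (center != 0.0) ? (1.0 / center) : 0.0;
      }
   }
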
diff --git a/src/struct_ls/pfmg_setup_rap5.c b/src/struct_ls/pfmg_setup_rap5.c
index ae0aa5a..1156212 100644
--- a/src/struct_ls/pfmg_setup_rap5.c
+++ b/src/struct_ls/pfmg_setup_rap5.c
@@ -194,12 +194,8 @@ hypre_PFMGBuildCoarseOp5( hypre_StructMatrix *A,
 
    HYPRE_Real           *rap_cc, *rap_cw, *rap_ce;
    HYPRE_Real           *rap_cb, *rap_ca;
-   HYPRE_Real            west, east;
-   HYPRE_Real            center_int, center_bdy;
 
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iPm1, iPp1;
+   HYPRE_Real            center_int, center_bdy;
                       
    HYPRE_Int             OffsetA; 
    HYPRE_Int             OffsetP; 
@@ -348,10 +344,13 @@ hypre_PFMGBuildCoarseOp5( hypre_StructMatrix *A,
                              A_dbox, fstart, stridef, iA,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iA,iAc,iAm1,iAp1,iPm1,iPp1,west,east) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop3For(iP, iA, iAc)
          {
+            HYPRE_Int iAm1,iAp1,iPm1,iPp1;
+            HYPRE_Real  west, east;
+             
             iAm1 = iA - OffsetA;
             iAp1 = iA + OffsetA;
 
@@ -407,7 +406,7 @@ hypre_PFMGBuildCoarseOp5( hypre_StructMatrix *A,
                              A_dbox, fstart, stridef, iA,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop2For(iA, iAc)
          {
@@ -442,7 +441,7 @@ hypre_PFMGBuildCoarseOp5( hypre_StructMatrix *A,
                                 A_dbox, bfstart, stridef, iA,
                                 RAP_dbox, bcstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iA,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(iA, iAc)
             {
diff --git a/src/struct_ls/pfmg_setup_rap7.c b/src/struct_ls/pfmg_setup_rap7.c
index 4bd0abf..fee41dc 100644
--- a/src/struct_ls/pfmg_setup_rap7.c
+++ b/src/struct_ls/pfmg_setup_rap7.c
@@ -201,12 +201,7 @@ hypre_PFMGBuildCoarseOp7( hypre_StructMatrix *A,
 
    HYPRE_Real           *rap_cc, *rap_cw, *rap_ce, *rap_cs, *rap_cn;
    HYPRE_Real           *rap_cb, *rap_ca;
-   HYPRE_Real            west, east, south, north;
    HYPRE_Real            center_int, center_bdy;
-
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iPm1, iPp1;
                       
    HYPRE_Int             OffsetA; 
    HYPRE_Int             OffsetP; 
@@ -373,10 +368,13 @@ hypre_PFMGBuildCoarseOp7( hypre_StructMatrix *A,
                              A_dbox, fstart, stridef, iA,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iA,iAc,iAm1,iAp1,iPm1,iPp1,west,east,south,north) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop3For(iP, iA, iAc)
          {
+            HYPRE_Int iAm1,iAp1,iPm1,iPp1;
+            HYPRE_Real west,east,south,north;
+             
             iAm1 = iA - OffsetA;
             iAp1 = iA + OffsetA;
 
diff --git a/src/struct_ls/pfmg_solve.c b/src/struct_ls/pfmg_solve.c
index 88fd973..e90981d 100644
--- a/src/struct_ls/pfmg_solve.c
+++ b/src/struct_ls/pfmg_solve.c
@@ -64,7 +64,7 @@ hypre_PFMGSolve( void               *pfmg_vdata,
    HYPRE_Int            *active_l        = (pfmg_data -> active_l);
 
    HYPRE_Real            b_dot_b = 0, r_dot_r, eps = 0;
-   HYPRE_Real            e_dot_e, x_dot_x;
+   HYPRE_Real            e_dot_e = 0.0, x_dot_x = 1.0;
                     
    HYPRE_Int             i, l;
    HYPRE_Int             constant_coefficient;
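
[Editorial sketch] Initializing e_dot_e and x_dot_x here looks like a defensive fix: when the relative-change computation is skipped (e.g. the test is disabled or no iterations run), the old declarations left both values indeterminate while later code could still read them. With defaults {0.0, 1.0} the relative-change ratio is a well-defined 0. A hedged sketch of the guarded pattern (assumed control flow and illustrative names; the actual test sits further down in hypre_PFMGSolve):

   #include <stddef.h>

   /* Sum of squares of a vector. */
   static double dot_self(const double *v, size_t n)
   {
      double s = 0.0;
      size_t i;
      for (i = 0; i < n; i++)
      {
         s += v[i] * v[i];
      }
      return s;
   }

   /* If rel_change is off, the defaults give the ratio 0/1 = 0
    * instead of a read of uninitialized memory. */
   int small_rel_change(const double *e, const double *x, size_t n,
                        int rel_change, double eps)
   {
      double e_dot_e = 0.0, x_dot_x = 1.0;
      if (rel_change)
      {
         e_dot_e = dot_self(e, n);
         x_dot_x = dot_self(x, n);
      }
      return (e_dot_e / x_dot_x) < eps;
   }
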
diff --git a/src/struct_ls/point_relax.c b/src/struct_ls/point_relax.c
index 80f4fe3..475a6c7 100644
--- a/src/struct_ls/point_relax.c
+++ b/src/struct_ls/point_relax.c
@@ -102,7 +102,7 @@ hypre_PointRelaxCreate( MPI_Comm  comm )
 HYPRE_Int
 hypre_PointRelaxDestroy( void *relax_vdata )
 {
-	hypre_PointRelaxData *relax_data = (hypre_PointRelaxData *)relax_vdata;
+   hypre_PointRelaxData *relax_data = (hypre_PointRelaxData *)relax_vdata;
    HYPRE_Int             i;
 
    if (relax_data)
@@ -350,9 +350,6 @@ hypre_PointRelax( void               *relax_vdata,
    void                  *matvec_data = NULL;
 
    HYPRE_Int              Ai;
-   HYPRE_Int              bi;
-   HYPRE_Int              xi;
-   HYPRE_Int              ti;
                         
    hypre_IndexRef         stride;
    hypre_IndexRef         start;
@@ -450,7 +447,7 @@ hypre_PointRelax( void               *relax_vdata,
                hypre_BoxArrayBox(hypre_StructVectorDataSpace(b), i);
             x_data_box =
                hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i);
-
+             
             Ap = hypre_StructMatrixBoxData(A, i, diag_rank);
             bp = hypre_StructVectorBoxData(b, i);
             xp = hypre_StructVectorBoxData(x, i);
@@ -471,7 +468,7 @@ hypre_PointRelax( void               *relax_vdata,
                                       b_data_box, start, stride, bi,
                                       x_data_box, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(bi, xi)
                   {
@@ -488,7 +485,7 @@ hypre_PointRelax( void               *relax_vdata,
                                       b_data_box, start, stride, bi,
                                       x_data_box, start, stride, xi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,bi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, bi, xi)
                   {
@@ -608,7 +605,7 @@ hypre_PointRelax( void               *relax_vdata,
                                       A_data_box, start, stride, Ai,
                                       t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(Ai, ti)
                   {
@@ -711,10 +708,6 @@ hypre_PointRelax_core0( void               *relax_vdata,
    hypre_IndexRef         start;
    hypre_Index            loop_size;
    HYPRE_Int              si, sk, ssi[MAX_DEPTH], depth, k;
-   HYPRE_Int              Ai;
-   HYPRE_Int              bi;
-   HYPRE_Int              xi;
-   HYPRE_Int              ti;
 
    stencil       = hypre_StructMatrixStencil(A);
    stencil_shape = hypre_StructStencilShape(stencil);
@@ -726,7 +719,7 @@ hypre_PointRelax_core0( void               *relax_vdata,
                        b_data_box, start, stride, bi,
                        t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
    hypre_BoxLoop2For(bi, ti)
    {
@@ -802,7 +795,7 @@ hypre_PointRelax_core0( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -824,7 +817,7 @@ hypre_PointRelax_core0( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -845,7 +838,7 @@ hypre_PointRelax_core0( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -865,7 +858,7 @@ hypre_PointRelax_core0( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -884,7 +877,7 @@ hypre_PointRelax_core0( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -902,7 +895,7 @@ hypre_PointRelax_core0( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -919,7 +912,7 @@ hypre_PointRelax_core0( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, ti)
             {
@@ -991,9 +984,6 @@ hypre_PointRelax_core12( void               *relax_vdata,
    hypre_Index            loop_size;
    HYPRE_Int              si, sk, ssi[MAX_DEPTH], depth, k;
    HYPRE_Int              Ai;
-   HYPRE_Int              bi;
-   HYPRE_Int              xi;
-   HYPRE_Int              ti;
 
    stencil       = hypre_StructMatrixStencil(A);
    stencil_shape = hypre_StructStencilShape(stencil);
@@ -1018,7 +1008,7 @@ hypre_PointRelax_core12( void               *relax_vdata,
                           b_data_box, start, stride, bi,
                           t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(bi, ti)
       {
@@ -1033,7 +1023,7 @@ hypre_PointRelax_core12( void               *relax_vdata,
                           b_data_box, start, stride, bi,
                           t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(bi, ti)
       {
@@ -1117,7 +1107,7 @@ hypre_PointRelax_core12( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1144,7 +1134,7 @@ hypre_PointRelax_core12( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1169,7 +1159,7 @@ hypre_PointRelax_core12( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1192,7 +1182,7 @@ hypre_PointRelax_core12( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1213,7 +1203,7 @@ hypre_PointRelax_core12( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1232,7 +1222,7 @@ hypre_PointRelax_core12( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1249,7 +1239,7 @@ hypre_PointRelax_core12( void               *relax_vdata,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1514,7 +1504,7 @@ HYPRE_Int hypre_relax_wtx( void *relax_vdata, HYPRE_Int pointset,
 
    HYPRE_Real weightc = 1 - weight;
    HYPRE_Real *xp, *tp;
-   HYPRE_Int compute_i, i, j, xi, ti;
+   HYPRE_Int compute_i, i, j;
 
    hypre_BoxArrayArray   *compute_box_aa;
    hypre_BoxArray        *compute_box_a;
@@ -1565,7 +1555,7 @@ HYPRE_Int hypre_relax_wtx( void *relax_vdata, HYPRE_Int pointset,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
@@ -1598,7 +1588,7 @@ HYPRE_Int hypre_relax_copy( void *relax_vdata, HYPRE_Int pointset,
    hypre_Index            loop_size;
 
    HYPRE_Real *xp, *tp;
-   HYPRE_Int compute_i, i, j, xi, ti;
+   HYPRE_Int compute_i, i, j;
 
    hypre_BoxArrayArray   *compute_box_aa;
    hypre_BoxArray        *compute_box_a;
@@ -1649,7 +1639,7 @@ HYPRE_Int hypre_relax_copy( void *relax_vdata, HYPRE_Int pointset,
                                 x_data_box, start, stride, xi,
                                 t_data_box, start, stride, ti);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,ti) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(xi, ti)
             {
diff --git a/src/struct_ls/red_black_constantcoef_gs.c b/src/struct_ls/red_black_constantcoef_gs.c
index 593dd09..4524341 100644
--- a/src/struct_ls/red_black_constantcoef_gs.c
+++ b/src/struct_ls/red_black_constantcoef_gs.c
@@ -26,7 +26,7 @@ hypre_RedBlackConstantCoefGS( void               *relax_vdata,
                               hypre_StructVector *b,
                               hypre_StructVector *x )
 {
-	hypre_RedBlackGSData  *relax_data = (hypre_RedBlackGSData  *)relax_vdata;
+   hypre_RedBlackGSData  *relax_data = (hypre_RedBlackGSData  *)relax_vdata;
 
    HYPRE_Int              max_iter    = (relax_data -> max_iter);
    HYPRE_Int              zero_guess  = (relax_data -> zero_guess);
@@ -46,8 +46,8 @@ hypre_RedBlackConstantCoefGS( void               *relax_vdata,
    hypre_Box             *x_dbox;
                         
    HYPRE_Int              Ai, Astart, Ani, Anj;
-   HYPRE_Int              bi, bstart, bni, bnj;
-   HYPRE_Int              xi, xstart, xni, xnj;
+   HYPRE_Int              bstart, bni, bnj;
+   HYPRE_Int              xstart, xni, xnj;
    HYPRE_Int              xoff0, xoff1, xoff2, xoff3, xoff4, xoff5;
                         
    HYPRE_Real            *Ap;
@@ -68,7 +68,7 @@ hypre_RedBlackConstantCoefGS( void               *relax_vdata,
    HYPRE_Int              offd[6];
                         
    HYPRE_Int              iter, rb, redblack, d;
-   HYPRE_Int              compute_i, i, j, ii, jj, kk;
+   HYPRE_Int              compute_i, i, j;
    HYPRE_Int              ni, nj, nk;
 
    /*----------------------------------------------------------
@@ -194,22 +194,17 @@ hypre_RedBlackConstantCoefGS( void               *relax_vdata,
                   Ai= hypre_CCBoxIndexRank(A_dbox, start);
                   AApd= 1.0/Ap[Ai];
 
+                  hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                  for (kk = 0; kk < nk; kk++)
+                  hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,
+                                                      bstart,bni,bnj,bi,
+                                                      xstart,xni,xnj,xi);
                   {
-                     for (jj = 0; jj < nj; jj++)
-                     {
-                        ii = (kk + jj + redblack) % 2;
-                        bi = bstart + kk*bnj*bni + jj*bni + ii;
-                        xi = xstart + kk*xnj*xni + jj*xni + ii;
-                        for (; ii < ni; ii+=2, bi+=2, xi+=2)
-                        {
-                           xp[xi] = bp[bi]*AApd;
-                        }
-                     }
+                     xp[xi] = bp[bi]*AApd;
                   }
+                  hypre_RedBlackConstantcoefLoopEnd();                  
                }
 
                else      /* variable coefficient diag */
@@ -218,23 +213,18 @@ hypre_RedBlackConstantCoefGS( void               *relax_vdata,
                   Ani = hypre_BoxSizeX(A_dbox);
                   Anj = hypre_BoxSizeY(A_dbox);
 
+                  hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                  for (kk = 0; kk < nk; kk++)
+                  hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                          Astart,Ani,Anj,Ai,
+                                          bstart,bni,bnj,bi,
+                                          xstart,xni,xnj,xi);
                   {
-                     for (jj = 0; jj < nj; jj++)
-                     {
-                        ii = (kk + jj + redblack) % 2;
-                        Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                        bi = bstart + kk*bnj*bni + jj*bni + ii;
-                        xi = xstart + kk*xnj*xni + jj*xni + ii;
-                        for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                        {
-                           xp[xi] = bp[bi] / Ap[Ai];
-                        }
-                     }
+                     xp[xi] = bp[bi] / Ap[Ai];
                   }
+                  hypre_RedBlackLoopEnd();
                }
 
             }
@@ -358,75 +348,61 @@ hypre_RedBlackConstantCoefGS( void               *relax_vdata,
                   switch(stencil_size)
                   {
                      case 7:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,
+                                                            bstart,bni,bnj,bi,
+                                                            xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] - 
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1] -
-                                     App2*xp[xi + xoff2] -
-                                     App3*xp[xi + xoff3] -
-                                     App4*xp[xi + xoff4] -
-                                     App5*xp[xi + xoff5])*AApd;
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] - 
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1] -
+                               App2*xp[xi + xoff2] -
+                               App3*xp[xi + xoff3] -
+                               App4*xp[xi + xoff4] -
+                               App5*xp[xi + xoff5])*AApd;
                         }
+                        hypre_RedBlackConstantcoefLoopEnd();
+                        
                         break;
 
                      case 5:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,
+                                                            bstart,bni,bnj,bi,
+                                                            xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] -
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1] -
-                                     App2*xp[xi + xoff2] -
-                                     App3*xp[xi + xoff3])*AApd;
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] -
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1] -
+                               App2*xp[xi + xoff2] -
+                               App3*xp[xi + xoff3])*AApd;
                         }
+                        hypre_RedBlackConstantcoefLoopEnd();
                         break;
 
                      case 3:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,
+                                                            bstart,bni,bnj,bi,
+                                                            xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] -
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1])*AApd;
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] -
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1])*AApd;
                         }
+                        hypre_RedBlackConstantcoefLoopEnd();
                         break;
                   }
 
@@ -441,78 +417,63 @@ hypre_RedBlackConstantCoefGS( void               *relax_vdata,
                   switch(stencil_size)
                   {
                      case 7:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                                Astart,Ani,Anj,Ai,
+                                                bstart,bni,bnj,bi,
+                                                xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] - 
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1] -
-                                     App2*xp[xi + xoff2] -
-                                     App3*xp[xi + xoff3] -
-                                     App4*xp[xi + xoff4] -
-                                     App5*xp[xi + xoff5]) / Ap[Ai];
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] - 
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1] -
+                               App2*xp[xi + xoff2] -
+                               App3*xp[xi + xoff3] -
+                               App4*xp[xi + xoff4] -
+                               App5*xp[xi + xoff5]) / Ap[Ai];
                         }
+                        hypre_RedBlackLoopEnd();
                         break;
 
                      case 5:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                                Astart,Ani,Anj,Ai,
+                                                bstart,bni,bnj,bi,
+                                                xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] - 
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1] -
-                                     App2*xp[xi + xoff2] -
-                                     App3*xp[xi + xoff3]) / Ap[Ai]; 
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] - 
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1] -
+                               App2*xp[xi + xoff2] -
+                               App3*xp[xi + xoff3]) / Ap[Ai]; 
                         }
+                        hypre_RedBlackLoopEnd();
                         break;
 
                      case 3:
+                        hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                        for (kk = 0; kk < nk; kk++)
+                        hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                                Astart,Ani,Anj,Ai,
+                                                bstart,bni,bnj,bi,
+                                                xstart,xni,xnj,xi);
                         {
-                           for (jj = 0; jj < nj; jj++)
-                           {
-                              ii = (kk + jj + redblack) % 2;
-                              Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                              bi = bstart + kk*bnj*bni + jj*bni + ii;
-                              xi = xstart + kk*xnj*xni + jj*xni + ii;
-                              for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                              {
-                                 xp[xi] =
-                                    (bp[bi] -
-                                     App0*xp[xi + xoff0] -
-                                     App1*xp[xi + xoff1]) / Ap[Ai]; 
-                              }
-                           }
+                           xp[xi] =
+                              (bp[bi] -
+                               App0*xp[xi + xoff0] -
+                               App1*xp[xi + xoff1]) / Ap[Ai]; 
                         }
+                        hypre_RedBlackLoopEnd();
                         break;
 
                   }  /* switch(stencil_size) */
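
[Editorial sketch] The rewrite above replaces the hand-rolled kk/jj/ii triple loops with the hypre_RedBlackLoop* macros defined in red_black_gs.h (next file), so a single call site can expand to OpenMP, RAJA, Kokkos, or CUDA code. The red/black structure itself is unchanged: within each (kk, jj) line the first updated point has parity (kk + jj + redblack) % 2 and the sweep strides by 2, so each half-sweep touches only one color and has no intra-sweep dependencies. A standalone sketch of that indexing (plain C; real sweeps also subtract off-diagonal neighbor contributions, omitted here to show only the index math):

   void rb_sweep(int ni, int nj, int nk, int redblack,
                 const double *b, const double *Ad, double *x)
   {
      int ii, jj, kk;
      for (kk = 0; kk < nk; kk++)
      {
         for (jj = 0; jj < nj; jj++)
         {
            ii = (kk + jj + redblack) % 2;          /* parity of 1st point */
            int idx = kk * nj * ni + jj * ni + ii;  /* flattened index     */
            for (; ii < ni; ii += 2, idx += 2)      /* stride 2: one color */
            {
               x[idx] = b[idx] / Ad[idx];  /* simplified diagonal update  */
            }
         }
      }
   }
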
diff --git a/src/struct_ls/red_black_gs.c b/src/struct_ls/red_black_gs.c
index 4cd8b32..5b36a93 100644
--- a/src/struct_ls/red_black_gs.c
+++ b/src/struct_ls/red_black_gs.c
@@ -53,7 +53,7 @@ hypre_RedBlackGSCreate( MPI_Comm  comm )
 HYPRE_Int
 hypre_RedBlackGSDestroy( void *relax_vdata )
 {
-	hypre_RedBlackGSData *relax_data = (hypre_RedBlackGSData *)relax_vdata;
+   hypre_RedBlackGSData *relax_data = (hypre_RedBlackGSData *)relax_vdata;
 
    if (relax_data)
    {
@@ -147,9 +147,9 @@ hypre_RedBlackGS( void               *relax_vdata,
    hypre_Box             *b_dbox;
    hypre_Box             *x_dbox;
                         
-   HYPRE_Int              Ai, Astart, Ani, Anj;
-   HYPRE_Int              bi, bstart, bni, bnj;
-   HYPRE_Int              xi, xstart, xni, xnj;
+   HYPRE_Int              Astart, Ani, Anj;
+   HYPRE_Int              bstart, bni, bnj;
+   HYPRE_Int              xstart, xni, xnj;
    HYPRE_Int              xoff0, xoff1, xoff2, xoff3, xoff4, xoff5;
                         
    HYPRE_Real            *Ap;
@@ -166,7 +166,7 @@ hypre_RedBlackGS( void               *relax_vdata,
    HYPRE_Int              offd[6];
                         
    HYPRE_Int              iter, rb, redblack, d;
-   HYPRE_Int              compute_i, i, j, ii, jj, kk;
+   HYPRE_Int              compute_i, i, j;
    HYPRE_Int              ni, nj, nk;
 
    /*----------------------------------------------------------
@@ -288,23 +288,18 @@ hypre_RedBlackGS( void               *relax_vdata,
                   }
                }
 
+               hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-               for (kk = 0; kk < nk; kk++)
+               hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                       Astart,Ani,Anj,Ai,
+                                       bstart,bni,bnj,bi,
+                                       xstart,xni,xnj,xi);
                {
-                  for (jj = 0; jj < nj; jj++)
-                  {
-                     ii = (kk + jj + redblack) % 2;
-                     Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                     bi = bstart + kk*bnj*bni + jj*bni + ii;
-                     xi = xstart + kk*xnj*xni + jj*xni + ii;
-                     for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                     {
-                        xp[xi] = bp[bi] / Ap[Ai];
-                     }
-                  }
+                  xp[xi] = bp[bi] / Ap[Ai];
                }
+               hypre_RedBlackLoopEnd();
             }
          }
       }
@@ -418,78 +413,64 @@ hypre_RedBlackGS( void               *relax_vdata,
                switch(stencil_size)
                {
                   case 7:
+                     hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                     for (kk = 0; kk < nk; kk++)
+                     hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                             Astart,Ani,Anj,Ai,
+                                             bstart,bni,bnj,bi,
+                                             xstart,xni,xnj,xi);
                      {
-                        for (jj = 0; jj < nj; jj++)
-                        {
-                           ii = (kk + jj + redblack) % 2;
-                           Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                           bi = bstart + kk*bnj*bni + jj*bni + ii;
-                           xi = xstart + kk*xnj*xni + jj*xni + ii;
-                           for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                           {
-                              xp[xi] =
-                                 (bp[bi] - 
-                                  Ap0[Ai] * xp[xi + xoff0] -
-                                  Ap1[Ai] * xp[xi + xoff1] -
-                                  Ap2[Ai] * xp[xi + xoff2] -
-                                  Ap3[Ai] * xp[xi + xoff3] -
-                                  Ap4[Ai] * xp[xi + xoff4] -
-                                  Ap5[Ai] * xp[xi + xoff5]) / Ap[Ai];
-                           }
-                        }
+                        xp[xi] =
+                           (bp[bi] - 
+                            Ap0[Ai] * xp[xi + xoff0] -
+                            Ap1[Ai] * xp[xi + xoff1] -
+                            Ap2[Ai] * xp[xi + xoff2] -
+                            Ap3[Ai] * xp[xi + xoff3] -
+                            Ap4[Ai] * xp[xi + xoff4] -
+                            Ap5[Ai] * xp[xi + xoff5]) / Ap[Ai];
                      }
+                     hypre_RedBlackLoopEnd();
                      break;
 
                   case 5:
+                     hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                     for (kk = 0; kk < nk; kk++)
+                     hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                             Astart,Ani,Anj,Ai,
+                                             bstart,bni,bnj,bi,
+                                             xstart,xni,xnj,xi);
                      {
-                        for (jj = 0; jj < nj; jj++)
-                        {
-                           ii = (kk + jj + redblack) % 2;
-                           Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                           bi = bstart + kk*bnj*bni + jj*bni + ii;
-                           xi = xstart + kk*xnj*xni + jj*xni + ii;
-                           for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                           {
-                              xp[xi] =
-                                 (bp[bi] - 
-                                  Ap0[Ai] * xp[xi + xoff0] -
-                                  Ap1[Ai] * xp[xi + xoff1] -
-                                  Ap2[Ai] * xp[xi + xoff2] -
-                                  Ap3[Ai] * xp[xi + xoff3]) / Ap[Ai];
-                           }
-                        }
+                        xp[xi] =
+                           (bp[bi] - 
+                            Ap0[Ai] * xp[xi + xoff0] -
+                            Ap1[Ai] * xp[xi + xoff1] -
+                            Ap2[Ai] * xp[xi + xoff2] -
+                            Ap3[Ai] * xp[xi + xoff3]) / Ap[Ai];
                      }
+                     hypre_RedBlackLoopEnd();
                      break;
 
                   case 3:
+                     hypre_RedBlackLoopInit();
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ii,jj,Ai,bi,xi,kk) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-                     for (kk = 0; kk < nk; kk++)
+                     hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
+                                             Astart,Ani,Anj,Ai,
+                                             bstart,bni,bnj,bi,
+                                             xstart,xni,xnj,xi);
                      {
-                        for (jj = 0; jj < nj; jj++)
-                        {
-                           ii = (kk + jj + redblack) % 2;
-                           Ai = Astart + kk*Anj*Ani + jj*Ani + ii;
-                           bi = bstart + kk*bnj*bni + jj*bni + ii;
-                           xi = xstart + kk*xnj*xni + jj*xni + ii;
-                           for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)
-                           {
-                              xp[xi] =
-                                 (bp[bi] - 
-                                  Ap0[Ai] * xp[xi + xoff0] -
-                                  Ap1[Ai] * xp[xi + xoff1]) / Ap[Ai];
-                           }
-                        }
+                        xp[xi] =
+                           (bp[bi] - 
+                            Ap0[Ai] * xp[xi + xoff0] -
+                            Ap1[Ai] * xp[xi + xoff1]) / Ap[Ai];
                      }
+                     hypre_RedBlackLoopEnd();
+
                      break;
                }
             }
diff --git a/src/struct_ls/red_black_gs.h b/src/struct_ls/red_black_gs.h
index cc7d25f..af901bb 100644
--- a/src/struct_ls/red_black_gs.h
+++ b/src/struct_ls/red_black_gs.h
@@ -42,3 +42,226 @@ typedef struct
 
 } hypre_RedBlackGSData;
 
+#ifdef HYPRE_USE_RAJA
+#define HYPRE_REDBLACK_PRIVATE hypre__global_error
+#define hypre_RedBlackLoopInit()
+#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\
+				Astart,Ani,Anj,Ai,	\
+				bstart,bni,bnj,bi,	\
+				xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,Ai,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    Ai = Astart + kk*Anj*Ani + jj*Ani + ii;			\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}
+
+#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\
+				bstart,bni,bnj,bi,	\
+				xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackConstantcoefLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}  
+#elif defined(HYPRE_USE_KOKKOS)
+#define HYPRE_REDBLACK_PRIVATE hypre__global_error
+#define hypre_RedBlackLoopInit()
+#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\
+				Astart,Ani,Anj,Ai,	\
+				bstart,bni,bnj,bi,	\
+				xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    HYPRE_Int hypre_fake = 0;						\
+    Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,Ai,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    Ai = Astart + kk*Anj*Ani + jj*Ani + ii;			\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}
+
+#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\
+				bstart,bni,bnj,bi,	\
+				xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackConstantcoefLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}  
+#elif defined(HYPRE_USE_CUDA)
+#define HYPRE_REDBLACK_PRIVATE hypre__global_error
+#define hypre_RedBlackLoopInit()
+#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\
+				Astart,Ani,Anj,Ai,	\
+				bstart,bni,bnj,bi,	\
+				xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    BoxLoopforall(cuda_traversal(),hypre__tot,[=] __device__ (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,Ai,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    Ai = Astart + kk*Anj*Ani + jj*Ani + ii;			\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}
+	   
+#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\
+					    bstart,bni,bnj,bi,	\
+					    xstart,xni,xnj,xi)	\
+{					  \
+    HYPRE_Int hypre__tot = nk*nj*((ni+1)/2);				\
+    BoxLoopforall(cuda_traversal(),hypre__tot,[=] __device__ (HYPRE_Int idx) \
+    {									\
+        HYPRE_Int idx_local = idx;					\
+	HYPRE_Int ii,jj,kk,bi,xi;					\
+	HYPRE_Int local_ii;						\
+	kk = idx_local % nk;						\
+	idx_local = idx_local / nk;					\
+	jj = idx_local % nj;						\
+	idx_local = idx_local / nj;					\
+	local_ii = (kk + jj + redblack) % 2;				\
+	ii = 2*idx_local + local_ii;					\
+	if (ii < ni)							\
+	{								\
+	    bi = bstart + kk*bnj*bni + jj*bni + ii;			\
+	    xi = xstart + kk*xnj*xni + jj*xni + ii;			\
+
+#define hypre_RedBlackConstantcoefLoopEnd()			\
+         }						\
+     });						\
+     hypre_fence();					\
+}
+#else
+#define HYPRE_REDBLACK_PRIVATE hypre__kk
+#define hypre_RedBlackLoopInit()\
+{\
+   HYPRE_Int hypre__kk;
+
+#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack,\
+				Astart,Ani,Anj,Ai,\
+				bstart,bni,bnj,bi,\
+				xstart,xni,xnj,xi)\
+   for (hypre__kk = 0; hypre__kk < nk; hypre__kk++)\
+   {\
+      HYPRE_Int ii,jj,Ai,bi,xi;\
+      for (jj = 0; jj < nj; jj++)\
+      {\
+         ii = (hypre__kk + jj + redblack) % 2;\
+         Ai = Astart + hypre__kk*Anj*Ani + jj*Ani + ii;\
+         bi = bstart + hypre__kk*bnj*bni + jj*bni + ii;\
+         xi = xstart + hypre__kk*xnj*xni + jj*xni + ii;\
+         for (; ii < ni; ii+=2, Ai+=2, bi+=2, xi+=2)\
+         {
+
+#define hypre_RedBlackLoopEnd()\
+         }\
+      }\
+   }\
+}
+
+#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack,\
+                                            bstart,bni,bnj,bi,\
+                                            xstart,xni,xnj,xi)\
+   for (hypre__kk = 0; hypre__kk < nk; hypre__kk++)\
+   {\
+      HYPRE_Int ii,jj,bi,xi;\
+      for (jj = 0; jj < nj; jj++)\
+      {\
+         ii = (hypre__kk + jj + redblack) % 2;\
+         bi = bstart + hypre__kk*bnj*bni + jj*bni + ii;\
+         xi = xstart + hypre__kk*xnj*xni + jj*xni + ii;\
+         for (; ii < ni; ii+=2, bi+=2, xi+=2)\
+         {
+
+#define hypre_RedBlackConstantcoefLoopEnd()\
+         }\
+      }\
+   }\
+}
+#endif
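
[Editorial note] Summarizing the new header: the host (#else) path reproduces the original triple loop and exposes hypre__kk as HYPRE_REDBLACK_PRIVATE for the OpenMP pragma, while the RAJA/Kokkos/CUDA paths flatten the iteration space to nk*nj*((ni+1)/2) lambda invocations and recover (kk, jj, ii) by div/mod, guarding with if (ii < ni) because an odd ni gives the two colors different point counts. (The constant-coefficient variant declares no Ai, since the diagonal is a single scalar there.) Call sites take the shape used in red_black_gs.c above; quoted for reference, not compilable in isolation since the operands are hypre internals:

   hypre_RedBlackLoopInit();
   #ifdef HYPRE_USING_OPENMP
   #pragma omp parallel for private(HYPRE_REDBLACK_PRIVATE) HYPRE_SMP_SCHEDULE
   #endif
   hypre_RedBlackLoopBegin(ni,nj,nk,redblack,
                           Astart,Ani,Anj,Ai,
                           bstart,bni,bnj,bi,
                           xstart,xni,xnj,xi);
   {
      xp[xi] = bp[bi] / Ap[Ai];
   }
   hypre_RedBlackLoopEnd();

Keeping the loop body as a brace-enclosed statement between Begin and End is what lets the same source expand to either nested for loops or a device lambda.
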
diff --git a/src/struct_ls/semi_interp.c b/src/struct_ls/semi_interp.c
index e9bfdc5..34df2f2 100644
--- a/src/struct_ls/semi_interp.c
+++ b/src/struct_ls/semi_interp.c
@@ -125,8 +125,6 @@ hypre_SemiInterp( void               *interp_vdata,
    hypre_Box              *e_dbox;
                        
    HYPRE_Int               Pi;
-   HYPRE_Int               xci;
-   HYPRE_Int               ei;
    HYPRE_Int               constant_coefficient;
                          
    HYPRE_Real             *Pp0, *Pp1;
@@ -201,7 +199,7 @@ hypre_SemiInterp( void               *interp_vdata,
                           e_dbox, start, stride, ei,
                           xc_dbox, startc, stridec, xci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(ei, xci)
       {
@@ -280,7 +278,7 @@ hypre_SemiInterp( void               *interp_vdata,
                hypre_BoxLoop1Begin(hypre_StructMatrixNDim(P), loop_size,
                                    e_dbox, start, stride, ei);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop1For(ei)
                {
@@ -295,7 +293,7 @@ hypre_SemiInterp( void               *interp_vdata,
                                    P_dbox, startc, stridec, Pi,
                                    e_dbox, start, stride, ei);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Pi,ei) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(Pi, ei)
                {
diff --git a/src/struct_ls/semi_restrict.c b/src/struct_ls/semi_restrict.c
index d104aad..c890fb9 100644
--- a/src/struct_ls/semi_restrict.c
+++ b/src/struct_ls/semi_restrict.c
@@ -123,8 +123,6 @@ hypre_SemiRestrict( void               *restrict_vdata,
    hypre_Box              *rc_dbox;
                        
    HYPRE_Int               Ri;
-   HYPRE_Int               ri;
-   HYPRE_Int               rci;
    HYPRE_Int               constant_coefficient;
 
    HYPRE_Real             *Rp0, *Rp1;
@@ -248,7 +246,7 @@ hypre_SemiRestrict( void               *restrict_vdata,
                                    r_dbox,  start,  stride,  ri,
                                    rc_dbox, startc, stridec, rci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ri,rci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(ri, rci)
                {
@@ -264,7 +262,7 @@ hypre_SemiRestrict( void               *restrict_vdata,
                                    r_dbox,  start,  stride,  ri,
                                    rc_dbox, startc, stridec, rci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ri,ri,rci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop3For(Ri, ri, rci)
                {
diff --git a/src/struct_ls/semi_setup_rap.c b/src/struct_ls/semi_setup_rap.c
index 1258544..abf02b6 100644
--- a/src/struct_ls/semi_setup_rap.c
+++ b/src/struct_ls/semi_setup_rap.c
@@ -35,7 +35,7 @@
       if (imacro==2) imacro=-1;                         \
       if (jmacro==2) jmacro=-1;                         \
       if (kmacro==2) kmacro=-1;                         \
-      hypre_SetIndex3(indexRAP,imacro,jmacro,kmacro);    \
+      hypre_SetIndex3(indexRAP,imacro,jmacro,kmacro);   \
    }
 
 /*--------------------------------------------------------------------------
@@ -322,11 +322,6 @@ hypre_SemiBuildRAP( hypre_StructMatrix *A,
    HYPRE_Real           *rap_ptrS, *rap_ptrU, *rap_ptrD;
 
    HYPRE_Int             symm_path_multiplier;
-
-   HYPRE_Int             iA, iAp;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iPp;
-   HYPRE_Int             iR;
                         
    HYPRE_Int             COffsetA; 
    HYPRE_Int             COffsetP; 
@@ -561,10 +556,11 @@ hypre_SemiBuildRAP( hypre_StructMatrix *A,
                                             A_dbox, fstart, stridef, iA,
                                             RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                         hypre_BoxLoop4For(iP, iR, iA, iAc)
                         {
+                           HYPRE_Int iAp,iPp;
                            /* path 1 : (stay,stay) */
                            rap_ptrS[iAc] +=          a_ptr[iA]           ;
 
@@ -597,10 +593,11 @@ hypre_SemiBuildRAP( hypre_StructMatrix *A,
                                             A_dbox, fstart, stridef, iA,
                                             RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                         hypre_BoxLoop4For(iP, iR, iA, iAc)
                         {
+                           HYPRE_Int iAp,iPp;
                            /* path 1 : (stay,stay) */
                            rap_ptrS[iAc] +=          a_ptr[iA]           ;
 
@@ -670,10 +667,11 @@ hypre_SemiBuildRAP( hypre_StructMatrix *A,
                                          A_dbox, fstart, stridef, iA,
                                          RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop4For(iP, iR, iA, iAc)
                      {
+                        HYPRE_Int iAp,iPp;
                         /* Path 1 : (stay,up) & symmetric path  */
                         iPp = iP + AOffsetP; 
                         rap_ptrS[iAc] += symm_path_multiplier *
@@ -739,10 +737,11 @@ hypre_SemiBuildRAP( hypre_StructMatrix *A,
                                          A_dbox, fstart, stridef, iA,
                                          RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAp,iPp) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop4For(iP, iR, iA, iAc)
                      {
+                        HYPRE_Int iAp,iPp;
                         /* Path 1 : (stay,up) */
                         iPp = iP + COffsetP + AOffsetP; 
                         rap_ptrU[iAc] +=          a_ptr[iA]  * pb[iPp];
diff --git a/src/struct_ls/smg.c b/src/struct_ls/smg.c
index 3a10636..a6f8000 100644
--- a/src/struct_ls/smg.c
+++ b/src/struct_ls/smg.c
@@ -53,7 +53,7 @@ hypre_SMGCreate( MPI_Comm  comm )
 HYPRE_Int
 hypre_SMGDestroy( void *smg_vdata )
 {
-	hypre_SMGData *smg_data = (hypre_SMGData *)smg_vdata;
+   hypre_SMGData *smg_data = (hypre_SMGData *)smg_vdata;
 
    HYPRE_Int l;
 
@@ -111,7 +111,7 @@ hypre_SMGDestroy( void *smg_vdata )
             hypre_StructVectorDestroy(smg_data -> tb_l[l+1]);
             hypre_StructVectorDestroy(smg_data -> tx_l[l+1]);
          }
-         hypre_SharedTFree(smg_data -> data);
+         hypre_DeviceTFree(smg_data -> data);
          hypre_TFree(smg_data -> grid_l);
          hypre_TFree(smg_data -> PT_grid_l);
          hypre_TFree(smg_data -> A_l);
@@ -468,7 +468,6 @@ hypre_SMGSetStructVectorConstantValues( hypre_StructVector *vector,
 {
    hypre_Box          *v_data_box;
 
-   HYPRE_Int           vi;
    HYPRE_Real         *vp;
 
    hypre_Box          *box;
@@ -495,7 +494,7 @@ hypre_SMGSetStructVectorConstantValues( hypre_StructVector *vector,
       hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                           v_data_box, start, stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(vi)
       {
diff --git a/src/struct_ls/smg2_setup_rap.c b/src/struct_ls/smg2_setup_rap.c
index 56ec9bc..e6c54ab 100644
--- a/src/struct_ls/smg2_setup_rap.c
+++ b/src/struct_ls/smg2_setup_rap.c
@@ -186,11 +186,6 @@ hypre_SMG2BuildRAPSym( hypre_StructMatrix *A,
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int            iA, iAm1, iAp1;
-   HYPRE_Int            iAc;
-   HYPRE_Int            iP, iP1;
-   HYPRE_Int            iR;
-
    HYPRE_Int            yOffsetA; 
    HYPRE_Int            xOffsetP; 
    HYPRE_Int            yOffsetP; 
@@ -337,7 +332,7 @@ hypre_SMG2BuildRAPSym( hypre_StructMatrix *A,
       * Switch statement to direct control to the appropriate BoxLoop depending
        * on stencil size. Default is full 9-point.
        *-----------------------------------------------------------------*/
-
+          
       switch (fine_stencil_size)
       {
 
@@ -357,29 +352,28 @@ hypre_SMG2BuildRAPSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
-
-               iP1 = iP - yOffsetP - xOffsetP;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
+               HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
                rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1];
-
+                
                iP1 = iP - yOffsetP;
                rap_cs[iAc] = rb[iR] * a_cc[iAm1] * pa[iP1]
                   +          rb[iR] * a_cs[iAm1]
                   +                   a_cs[iA]   * pa[iP1];
-
+                
                iP1 = iP - yOffsetP + xOffsetP;
                rap_cse[iAc] = rb[iR] * a_ce[iAm1] * pa[iP1];
-
+                
                iP1 = iP - xOffsetP;
                rap_cw[iAc] =          a_cw[iA]
                   +          rb[iR] * a_cw[iAm1] * pb[iP1]
                   +          ra[iR] * a_cw[iAp1] * pa[iP1];
-
+                
                rap_cc[iAc] =          a_cc[iA]
                   +          rb[iR] * a_cc[iAm1] * pb[iP]
                   +          ra[iR] * a_cc[iAp1] * pa[iP]
@@ -387,10 +381,9 @@ hypre_SMG2BuildRAPSym( hypre_StructMatrix *A,
                   +          ra[iR] * a_cs[iAp1]
                   +                   a_cs[iA]   * pb[iP]
                   +                   a_cn[iA]   * pa[iP];
-
             }
             hypre_BoxLoop4End(iP, iR, iA, iAc);
-
+            
             break;
 
             /*--------------------------------------------------------------
@@ -409,14 +402,14 @@ hypre_SMG2BuildRAPSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
                rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1]
                   +           rb[iR] * a_csw[iAm1]
                   +                    a_csw[iA]  * pa[iP1];
@@ -506,11 +499,6 @@ hypre_SMG2BuildRAPNoSym( hypre_StructMatrix *A,
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_cnw, *rap_cne;
 
-   HYPRE_Int            iA, iAm1, iAp1;
-   HYPRE_Int            iAc;
-   HYPRE_Int            iP, iP1;
-   HYPRE_Int            iR;
-
    HYPRE_Int            yOffsetA;
    HYPRE_Int            xOffsetP;
    HYPRE_Int            yOffsetP;
@@ -663,7 +651,7 @@ hypre_SMG2BuildRAPNoSym( hypre_StructMatrix *A,
           *--------------------------------------------------------------*/
 
          case 5:
-
+ 
             hypre_BoxGetSize(cgrid_box, loop_size);
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 PT_dbox,  cstart, stridec, iP,
@@ -671,14 +659,14 @@ hypre_SMG2BuildRAPNoSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
                rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1];
 
                iP1 = iP + yOffsetP;
@@ -706,7 +694,6 @@ hypre_SMG2BuildRAPNoSym( hypre_StructMatrix *A,
              *--------------------------------------------------------------*/
 
          default:
-
             hypre_BoxGetSize(cgrid_box, loop_size);
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 PT_dbox,  cstart, stridec, iP,
@@ -714,14 +701,14 @@ hypre_SMG2BuildRAPNoSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
                rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1]
                   +           ra[iR] * a_cne[iAp1]
                   +                    a_cne[iA]  * pb[iP1];
@@ -785,9 +772,6 @@ hypre_SMG2RAPPeriodicSym( hypre_StructMatrix *RAP,
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int            iAc;
-   HYPRE_Int            iAcm1;
-
    HYPRE_Int            xOffset;
 
    HYPRE_Real           zero = 0.0;
@@ -836,11 +820,11 @@ hypre_SMG2RAPPeriodicSym( hypre_StructMatrix *RAP,
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc,iAcm1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(iAc)
          {
-            iAcm1 = iAc - xOffset;
+            HYPRE_Int iAcm1 = iAc - xOffset;
                
             rap_cw[iAc] += (rap_cse[iAcm1] + rap_csw[iAc]);
             rap_cc[iAc] += (2.0 * rap_cs[iAc]);
@@ -850,7 +834,7 @@ hypre_SMG2RAPPeriodicSym( hypre_StructMatrix *RAP,
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(iAc)
          {
@@ -895,8 +879,6 @@ hypre_SMG2RAPPeriodicNoSym( hypre_StructMatrix *RAP,
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_cnw, *rap_cne;
 
-   HYPRE_Int            iAc;
-
    HYPRE_Real           zero = 0.0;
 
    hypre_SetIndex3(stridec, 1, 1, 1);
@@ -950,7 +932,7 @@ hypre_SMG2RAPPeriodicNoSym( hypre_StructMatrix *RAP,
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                              RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(iAc)
          {
diff --git a/src/struct_ls/smg3_setup_rap.c b/src/struct_ls/smg3_setup_rap.c
index 0c71e2b..27be9a9 100644
--- a/src/struct_ls/smg3_setup_rap.c
+++ b/src/struct_ls/smg3_setup_rap.c
@@ -268,11 +268,6 @@ hypre_SMG3BuildRAPSym( hypre_StructMatrix *A,
    HYPRE_Real           *rap_bc, *rap_bw, *rap_be, *rap_bs, *rap_bn;
    HYPRE_Real           *rap_csw, *rap_cse;
    HYPRE_Real           *rap_bsw, *rap_bse, *rap_bnw, *rap_bne;
-
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                         
    HYPRE_Int             zOffsetA; 
    HYPRE_Int             xOffsetP; 
@@ -574,14 +569,14 @@ hypre_SMG3BuildRAPSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
-            {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+            {                   
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP;
                rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1];
 
                iP1 = iP - zOffsetP - xOffsetP;
@@ -638,14 +633,14 @@ hypre_SMG3BuildRAPSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP;
                rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1]
                   +          rb[iR] * a_bs[iAm1]
                   +                   a_bs[iA]   * pa[iP1];
@@ -719,14 +714,14 @@ hypre_SMG3BuildRAPSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
                rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1];
 
                iP1 = iP - zOffsetP - yOffsetP;
@@ -759,7 +754,7 @@ hypre_SMG3BuildRAPSym( hypre_StructMatrix *A,
                rap_bn[iAc] = rb[iR] * a_cn[iAm1] * pa[iP1]
                   +          rb[iR] * a_bn[iAm1]
                   +                   a_bn[iA]   * pa[iP1];
- 
+
                iP1 = iP - zOffsetP + yOffsetP + xOffsetP;
                rap_bne[iAc] = rb[iR] * a_cne[iAm1] * pa[iP1];
 
@@ -822,14 +817,14 @@ hypre_SMG3BuildRAPSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
                rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1]
                   +           rb[iR] * a_bsw[iAm1]
                   +                    a_bsw[iA]   * pa[iP1];
@@ -982,11 +977,6 @@ hypre_SMG3BuildRAPNoSym( hypre_StructMatrix *A,
    HYPRE_Real           *rap_cnw, *rap_cne;
    HYPRE_Real           *rap_asw, *rap_ase, *rap_anw, *rap_ane;
 
-   HYPRE_Int            iA, iAm1, iAp1;
-   HYPRE_Int            iAc;
-   HYPRE_Int            iP, iP1;
-   HYPRE_Int            iR;
-
    HYPRE_Int            zOffsetA;
    HYPRE_Int            xOffsetP;
    HYPRE_Int            yOffsetP;
@@ -1282,14 +1272,14 @@ hypre_SMG3BuildRAPNoSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP;
                rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1];
 
                iP1 = iP + zOffsetP + xOffsetP;
@@ -1337,14 +1327,14 @@ hypre_SMG3BuildRAPNoSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP;
                rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1]
                   +          ra[iR] * a_an[iAp1]
                   +                   a_an[iA]   * pb[iP1];
@@ -1411,14 +1401,14 @@ hypre_SMG3BuildRAPNoSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+               HYPRE_Int  iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
                rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1];
 
                iP1 = iP + zOffsetP + yOffsetP;
@@ -1443,7 +1433,7 @@ hypre_SMG3BuildRAPNoSym( hypre_StructMatrix *A,
                rap_aw[iAc] = ra[iR] * a_cw[iAp1] * pb[iP1]
                   +          ra[iR] * a_aw[iAp1]
                   +                   a_aw[iA]   * pb[iP1];
- 
+
                iP1 = iP + zOffsetP - yOffsetP + xOffsetP;
                rap_ase[iAc] = ra[iR] * a_cse[iAp1] * pb[iP1];
 
@@ -1506,14 +1496,14 @@ hypre_SMG3BuildRAPNoSym( hypre_StructMatrix *A,
                                 A_dbox,   fstart, stridef, iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
                rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1]
                   +           ra[iR] * a_ane[iAp1]
                   +                    a_ane[iA]   * pb[iP1];
@@ -1637,12 +1627,6 @@ hypre_SMG3RAPPeriodicSym( hypre_StructMatrix *RAP,
    HYPRE_Real           *rap_bsw, *rap_bse, *rap_bnw, *rap_bne;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int            iAc;
-   HYPRE_Int            iAcmx;
-   HYPRE_Int            iAcmy;
-   HYPRE_Int            iAcmxmy;
-   HYPRE_Int            iAcpxmy;
-
    HYPRE_Int            xOffset;
    HYPRE_Int            yOffset;
 
@@ -1739,12 +1723,12 @@ hypre_SMG3RAPPeriodicSym( hypre_StructMatrix *RAP,
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                              RAP_dbox,  cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc,iAcmx,iAcmy) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(iAc)
          {
-            iAcmx = iAc - xOffset;
-            iAcmy = iAc - yOffset;
+            HYPRE_Int iAcmx = iAc - xOffset;
+            HYPRE_Int iAcmy = iAc - yOffset;
                
             rap_cc[iAc] += (2.0 * rap_bc[iAc]);
             rap_cw[iAc] += (rap_bw[iAc] + rap_be[iAcmx]);
@@ -1755,7 +1739,7 @@ hypre_SMG3RAPPeriodicSym( hypre_StructMatrix *RAP,
          hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                              RAP_dbox,  cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(iAc)
          {
@@ -1778,12 +1762,12 @@ hypre_SMG3RAPPeriodicSym( hypre_StructMatrix *RAP,
             hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                                 RAP_dbox,  cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc,iAcmxmy,iAcpxmy) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(iAc)
             {
-               iAcmxmy = iAc - xOffset - yOffset;
-               iAcpxmy = iAc + xOffset - yOffset;
+               HYPRE_Int iAcmxmy = iAc - xOffset - yOffset;
+               HYPRE_Int iAcpxmy = iAc + xOffset - yOffset;
                   
                rap_csw[iAc] += (rap_bsw[iAc] + rap_bne[iAcmxmy]);
                   
@@ -1795,7 +1779,7 @@ hypre_SMG3RAPPeriodicSym( hypre_StructMatrix *RAP,
             hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                                 RAP_dbox,  cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(iAc)
             {
@@ -1845,8 +1829,6 @@ hypre_SMG3RAPPeriodicNoSym( hypre_StructMatrix *RAP,
    HYPRE_Real           *rap_csw, *rap_cse, *rap_cnw, *rap_cne;
    HYPRE_Real           *rap_asw, *rap_ase, *rap_anw, *rap_ane;
 
-   HYPRE_Int            iAc;
-
    HYPRE_Real           zero = 0.0;
 
    hypre_StructStencil *stencil;
@@ -1974,7 +1956,7 @@ hypre_SMG3RAPPeriodicNoSym( hypre_StructMatrix *RAP,
                              RAP_dbox,  cstart, stridec, iAc);
 
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
 
          hypre_BoxLoop1For(iAc)
@@ -2012,7 +1994,7 @@ hypre_SMG3RAPPeriodicNoSym( hypre_StructMatrix *RAP,
             hypre_BoxLoop1Begin(hypre_StructMatrixNDim(RAP), loop_size,
                                 RAP_dbox,  cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iAc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(iAc)
             {
diff --git a/src/struct_ls/smg_axpy.c b/src/struct_ls/smg_axpy.c
index 0fe53a7..479f6b6 100644
--- a/src/struct_ls/smg_axpy.c
+++ b/src/struct_ls/smg_axpy.c
@@ -25,9 +25,6 @@ hypre_SMGAxpy( HYPRE_Real          alpha,
    HYPRE_Int         ndim = hypre_StructVectorNDim(x);
    hypre_Box        *x_data_box;
    hypre_Box        *y_data_box;
-                 
-   HYPRE_Int         xi;
-   HYPRE_Int         yi;
                     
    HYPRE_Real       *xp;
    HYPRE_Real       *yp;
@@ -58,7 +55,7 @@ hypre_SMGAxpy( HYPRE_Real          alpha,
                           x_data_box, start, base_stride, xi,
                           y_data_box, start, base_stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(xi, yi)
       {
diff --git a/src/struct_ls/smg_relax.c b/src/struct_ls/smg_relax.c
index 35a6749..4d91ff8 100644
--- a/src/struct_ls/smg_relax.c
+++ b/src/struct_ls/smg_relax.c
@@ -263,7 +263,6 @@ hypre_SMGRelax( void               *relax_vdata,
    residual_data   = (relax_data -> residual_data);
    solve_data      = (relax_data -> solve_data);
 
-
    /*----------------------------------------------------------
     * Set zero values
     *----------------------------------------------------------*/
@@ -315,7 +314,6 @@ hypre_SMGRelax( void               *relax_vdata,
          (relax_data -> num_iterations) = (i + 1);
       }
    }
-
    /*----------------------------------------------------------
     * Free up memory according to memory_use parameter
     *----------------------------------------------------------*/
@@ -586,6 +584,7 @@ hypre_SMGRelaxSetupASol( void               *relax_vdata,
          hypre_SMGSetMemoryUse(solve_data[i], (relax_data -> memory_use));
          hypre_SMGSetTol(solve_data[i], 0.0);
          hypre_SMGSetMaxIter(solve_data[i], 1);
+		 
          hypre_SMGSetup(solve_data[i], A_sol, temp_vec, x);
       }
       else
diff --git a/src/struct_ls/smg_residual.c b/src/struct_ls/smg_residual.c
index 9ad5d94..de0badb 100644
--- a/src/struct_ls/smg_residual.c
+++ b/src/struct_ls/smg_residual.c
@@ -138,11 +138,6 @@ hypre_SMGResidual( void               *residual_vdata,
    hypre_Box              *x_data_box;
    hypre_Box              *b_data_box;
    hypre_Box              *r_data_box;
-                       
-   HYPRE_Int               Ai;
-   HYPRE_Int               xi;
-   HYPRE_Int               bi;
-   HYPRE_Int               ri;
                          
    HYPRE_Real             *Ap;
    HYPRE_Real             *xp;
@@ -201,7 +196,7 @@ hypre_SMGResidual( void               *residual_vdata,
                                    b_data_box, start, base_stride, bi,
                                    r_data_box, start, base_stride, ri);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,bi,ri) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(bi, ri)
                {
@@ -253,7 +248,7 @@ hypre_SMGResidual( void               *residual_vdata,
                                    x_data_box, start, base_stride, xi,
                                    r_data_box, start, base_stride, ri);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,xi,ri) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop3For(Ai, xi, ri)
                {
diff --git a/src/struct_ls/smg_setup.c b/src/struct_ls/smg_setup.c
index 13a753d..96077b1 100644
--- a/src/struct_ls/smg_setup.c
+++ b/src/struct_ls/smg_setup.c
@@ -186,6 +186,7 @@ hypre_SMGSetup( void               *smg_vdata,
    for (l = 0; l < (num_levels - 1); l++)
    {
       PT_l[l]  = hypre_SMGCreateInterpOp(A_l[l], PT_grid_l[l+1], cdir);
+	  
       hypre_StructMatrixInitializeShell(PT_l[l]);
       data_size += hypre_StructMatrixDataSize(PT_l[l]);
 
@@ -228,7 +229,8 @@ hypre_SMGSetup( void               *smg_vdata,
       hypre_StructVectorInitializeShell(tx_l[l+1]);
    }
 
-   data = hypre_SharedCTAlloc(HYPRE_Real, data_size);
+   data = hypre_DeviceCTAlloc(HYPRE_Real,data_size);
+
    (smg_data -> data) = data;
 
    hypre_StructVectorInitializeData(tb_l[0], data);
@@ -331,6 +333,7 @@ hypre_SMGSetup( void               *smg_vdata,
       hypre_SMGRelaxSetTempVec(relax_data_l[l], tb_l[l]);
       hypre_SMGRelaxSetNumPreRelax( relax_data_l[l], n_pre);
       hypre_SMGRelaxSetNumPostRelax( relax_data_l[l], n_post);
+
       hypre_SMGRelaxSetup(relax_data_l[l], A_l[l], b_l[l], x_l[l]);
 
       hypre_SMGSetupInterpOp(relax_data_l[l], A_l[l], b_l[l], x_l[l],
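
The allocation above pairs with the hypre_DeviceTFree call seen earlier in hypre_SMGDestroy. A minimal sketch of what such paired, zero-initializing device wrappers can look like, assuming CUDA managed memory (the my_ names are hypothetical stand-ins for illustration, not hypre's actual implementation):

    #include <cuda_runtime.h>
    #include <string.h>

    /* CTAlloc-style: allocate count objects of the given size, zeroed. */
    static void *my_device_calloc(size_t count, size_t size)
    {
       void *ptr = NULL;
       if (cudaMallocManaged(&ptr, count * size, cudaMemAttachGlobal) == cudaSuccess)
          memset(ptr, 0, count * size);   /* zero-initialize, as CTAlloc does */
       return ptr;
    }

    #define my_DeviceCTAlloc(type, count) \
       ((type *) my_device_calloc((size_t)(count), sizeof(type)))
    /* Pairs with the alloc above; ptr must be a modifiable lvalue. */
    #define my_DeviceTFree(ptr)  (cudaFree(ptr), (ptr) = NULL)

Usage mirrors the hunks here: data = my_DeviceCTAlloc(HYPRE_Real, data_size); followed later by my_DeviceTFree(data);.
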
diff --git a/src/struct_ls/smg_setup_interp.c b/src/struct_ls/smg_setup_interp.c
index dd229d2..677a671 100644
--- a/src/struct_ls/smg_setup_interp.c
+++ b/src/struct_ls/smg_setup_interp.c
@@ -113,8 +113,6 @@ hypre_SMGSetupInterpOp( void               *relax_data,
    hypre_Box            *x_data_box;
    HYPRE_Real           *PTp;
    HYPRE_Real           *xp;
-   HYPRE_Int             PTi;
-   HYPRE_Int             xi;
 
    hypre_Index           loop_size;
    hypre_Index           start;
@@ -263,7 +261,7 @@ hypre_SMGSetupInterpOp( void               *relax_data,
                                    x_data_box,  start,  stride,  xi,
                                    PT_data_box, startc, stridec, PTi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,PTi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop2For(xi, PTi)
                {
diff --git a/src/struct_ls/sparse_msg2_setup_rap.c b/src/struct_ls/sparse_msg2_setup_rap.c
index 01bac00..d955967 100644
--- a/src/struct_ls/sparse_msg2_setup_rap.c
+++ b/src/struct_ls/sparse_msg2_setup_rap.c
@@ -201,11 +201,6 @@ hypre_SparseMSG2BuildRAPSym( hypre_StructMatrix *A,
    HYPRE_Real           *rap_cc, *rap_cw, *rap_cs;
    HYPRE_Real           *rap_csw, *rap_cse;
 
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
-                      
    HYPRE_Int             yOffsetA; 
    HYPRE_Int             xOffsetP; 
    HYPRE_Int             yOffsetP; 
@@ -397,14 +392,14 @@ hypre_SparseMSG2BuildRAPSym( hypre_StructMatrix *A,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
                rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1];
 
                iP1 = iP - yOffsetP;
@@ -442,21 +437,20 @@ hypre_SparseMSG2BuildRAPSym( hypre_StructMatrix *A,
          default:
 
             hypre_BoxGetSize(cgrid_box, loop_size);
-
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 P_dbox, Pstart, stridePR, iP,
                                 R_dbox, Pstart, stridePR, iR,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - yOffsetP - xOffsetP;
                rap_csw[iAc] = rb[iR] * a_cw[iAm1] * pa[iP1]
                   +           rb[iR] * a_csw[iAm1]
                   +                    a_csw[iA]  * pa[iP1];
@@ -548,11 +542,6 @@ hypre_SparseMSG2BuildRAPNoSym( hypre_StructMatrix *A,
 
    HYPRE_Real           *rap_ce, *rap_cn;
    HYPRE_Real           *rap_cnw, *rap_cne;
-
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                      
    HYPRE_Int             yOffsetA;
    HYPRE_Int             xOffsetP;
@@ -730,21 +719,20 @@ hypre_SparseMSG2BuildRAPNoSym( hypre_StructMatrix *A,
          case 5:
 
             hypre_BoxGetSize(cgrid_box, loop_size);
-
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 P_dbox, Pstart, stridePR, iP,
                                 R_dbox, Pstart, stridePR, iR,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
                rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1];
 
                iP1 = iP + yOffsetP;
@@ -773,21 +761,20 @@ hypre_SparseMSG2BuildRAPNoSym( hypre_StructMatrix *A,
          default:
 
             hypre_BoxGetSize(cgrid_box, loop_size);
-
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 P_dbox, Pstart, stridePR, iP,
                                 R_dbox, Pstart, stridePR, iR,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - yOffsetA;
-               iAp1 = iA + yOffsetA;
+               HYPRE_Int iAm1 = iA - yOffsetA;
+               HYPRE_Int iAp1 = iA + yOffsetA;
 
-               iP1 = iP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + yOffsetP + xOffsetP;
                rap_cne[iAc] = ra[iR] * a_ce[iAp1] * pb[iP1]
                   +           ra[iR] * a_cne[iAp1]
                   +                    a_cne[iA]  * pb[iP1];
diff --git a/src/struct_ls/sparse_msg3_setup_rap.c b/src/struct_ls/sparse_msg3_setup_rap.c
index 1a6646d..722ac9e 100644
--- a/src/struct_ls/sparse_msg3_setup_rap.c
+++ b/src/struct_ls/sparse_msg3_setup_rap.c
@@ -225,11 +225,6 @@ hypre_SparseMSG3BuildRAPSym( hypre_StructMatrix *A,
    HYPRE_Real           *rap_bc, *rap_bw, *rap_be, *rap_bs, *rap_bn;
    HYPRE_Real           *rap_csw, *rap_cse;
    HYPRE_Real           *rap_bsw, *rap_bse, *rap_bnw, *rap_bne;
-
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                         
    HYPRE_Int             zOffsetA; 
    HYPRE_Int             xOffsetP; 
@@ -563,14 +558,14 @@ hypre_SparseMSG3BuildRAPSym( hypre_StructMatrix *A,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
-            {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+            {           
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP;
                rap_bs[iAc] = rb[iR] * a_cs[iAm1] * pa[iP1];
 
                iP1 = iP - zOffsetP - xOffsetP;
@@ -633,14 +628,14 @@ hypre_SparseMSG3BuildRAPSym( hypre_StructMatrix *A,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
                rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1];
 
                iP1 = iP - zOffsetP - yOffsetP;
@@ -737,14 +732,14 @@ hypre_SparseMSG3BuildRAPSym( hypre_StructMatrix *A,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
+               HYPRE_Int iP1 = iP - zOffsetP - yOffsetP - xOffsetP;
                rap_bsw[iAc] = rb[iR] * a_csw[iAm1] * pa[iP1]
                   +           rb[iR] * a_bsw[iAm1]
                   +                    a_bsw[iA]   * pa[iP1];
@@ -898,11 +893,6 @@ hypre_SparseMSG3BuildRAPNoSym( hypre_StructMatrix *A,
    HYPRE_Real           *rap_ac, *rap_aw, *rap_ae, *rap_as, *rap_an;
    HYPRE_Real           *rap_cnw, *rap_cne;
    HYPRE_Real           *rap_asw, *rap_ase, *rap_anw, *rap_ane;
-
-   HYPRE_Int             iA, iAm1, iAp1;
-   HYPRE_Int             iAc;
-   HYPRE_Int             iP, iP1;
-   HYPRE_Int             iR;
                  
    HYPRE_Int             zOffsetA;
    HYPRE_Int             xOffsetP;
@@ -1221,21 +1211,20 @@ hypre_SparseMSG3BuildRAPNoSym( hypre_StructMatrix *A,
          case 7:
 
             hypre_BoxGetSize(cgrid_box, loop_size);
-
             hypre_BoxLoop4Begin(hypre_StructMatrixNDim(A), loop_size,
                                 P_dbox, Pstart, stridePR, iP,
                                 R_dbox, Pstart, stridePR, iR,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP;
                rap_an[iAc] = ra[iR] * a_cn[iAp1] * pb[iP1];
 
                iP1 = iP + zOffsetP + xOffsetP;
@@ -1289,14 +1278,14 @@ hypre_SparseMSG3BuildRAPNoSym( hypre_StructMatrix *A,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
                rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1];
 
                iP1 = iP + zOffsetP + yOffsetP;
@@ -1385,14 +1374,14 @@ hypre_SparseMSG3BuildRAPNoSym( hypre_StructMatrix *A,
                                 A_dbox, fstart, stridef,  iA,
                                 RAP_dbox, cstart, stridec, iAc);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,iP,iR,iA,iAc,iAm1,iAp1,iP1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop4For(iP, iR, iA, iAc)
             {
-               iAm1 = iA - zOffsetA;
-               iAp1 = iA + zOffsetA;
+               HYPRE_Int iAm1 = iA - zOffsetA;
+               HYPRE_Int iAp1 = iA + zOffsetA;
 
-               iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
+               HYPRE_Int iP1 = iP + zOffsetP + yOffsetP + xOffsetP;
                rap_ane[iAc] = ra[iR] * a_cne[iAp1] * pb[iP1]
                   +           ra[iR] * a_ane[iAp1]
                   +                    a_ane[iA]   * pb[iP1];
diff --git a/src/struct_ls/sparse_msg_filter.c b/src/struct_ls/sparse_msg_filter.c
index c321f63..d493ed8 100644
--- a/src/struct_ls/sparse_msg_filter.c
+++ b/src/struct_ls/sparse_msg_filter.c
@@ -325,22 +325,14 @@ hypre_SparseMSGFilterSetup( hypre_StructMatrix *A,
    hypre_Box             *A_dbox;
    hypre_Box             *v_dbox;
                         
-   HYPRE_Int              Ai;
-   HYPRE_Int              vi;
-                        
-   HYPRE_Real            *Ap;
    HYPRE_Real            *vxp;
    HYPRE_Real            *vyp;
    HYPRE_Real            *vzp;
-   HYPRE_Real             lambdax;
-   HYPRE_Real             lambday;
-   HYPRE_Real             lambdaz;
+
                         
    hypre_StructStencil   *stencil;
    hypre_Index           *stencil_shape;
-   HYPRE_Int              stencil_size;
-                        
-   HYPRE_Int              Astenc;
+   HYPRE_Int              stencil_size;                        
                         
    hypre_Index            loop_size;
    hypre_Index            cindex;
@@ -349,7 +341,7 @@ hypre_SparseMSGFilterSetup( hypre_StructMatrix *A,
    hypre_Index            stride;
    hypre_Index            stridev;
                         
-   HYPRE_Int              i, si;
+   HYPRE_Int              i;
 
    /*----------------------------------------------------------
     * Initialize some things
@@ -375,6 +367,9 @@ hypre_SparseMSGFilterSetup( hypre_StructMatrix *A,
    compute_boxes = hypre_StructGridBoxes(hypre_StructMatrixGrid(A));
    hypre_ForBoxI(i, compute_boxes)
    {
+
+      hypre_MatrixIndexMove(A, stencil_size, i, ierr,3);
+     
       compute_box = hypre_BoxArrayBox(compute_boxes, i);
 
       A_dbox = hypre_BoxArrayBox(hypre_StructMatrixDataSpace(A), i);
@@ -392,20 +387,28 @@ hypre_SparseMSGFilterSetup( hypre_StructMatrix *A,
                           A_dbox, start,  stride,  Ai,
                           v_dbox, startv, stridev, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ai,vi,lambdax,lambday,lambdaz,si,Ap,Astenc) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(Ai, vi)
       {
+         HYPRE_Real lambdax,lambday,lambdaz;
+         HYPRE_Real *Ap;
+         HYPRE_Int si,Astenc;
+		  
          lambdax = 0.0;
          lambday = 0.0;
          lambdaz = 0.0;
 
+		 
          for (si = 0; si < stencil_size; si++)
          {
-            Ap = hypre_StructMatrixBoxData(A, i, si);
-
+            //Ap = hypre_StructMatrixBoxData(A, i, si);
+            //Ap = data_A + indices_d[si];
+            Ap = hypre_StructGetMatrixBoxData(A, i, si);
             /* compute lambdax */
-            Astenc = hypre_IndexD(stencil_shape[si], 0);
+            //Astenc = hypre_IndexD(stencil_shape[si], 0);
+            //Astenc = stencil_shape_d[si];
+            Astenc = hypre_StructGetIndexD(stencil_shape[si], 0,stencil_shape_d[si]);
             if (Astenc == 0)
             {
                lambdax += Ap[Ai];
@@ -416,7 +419,10 @@ hypre_SparseMSGFilterSetup( hypre_StructMatrix *A,
             }
 
             /* compute lambday */
-            Astenc = hypre_IndexD(stencil_shape[si], 1);
+            //Astenc = hypre_IndexD(stencil_shape[si], 1);
+	    //Astenc = stencil_shape_d[stencil_size+si];
+	    Astenc = hypre_StructGetIndexD(stencil_shape[si], 1,stencil_shape_d[stencil_size+si]);
+	    
             if (Astenc == 0)
             {
                lambday += Ap[Ai];
@@ -427,7 +433,9 @@ hypre_SparseMSGFilterSetup( hypre_StructMatrix *A,
             }
 
             /* compute lambdaz */
-            Astenc = hypre_IndexD(stencil_shape[si], 2);
+            //Astenc = hypre_IndexD(stencil_shape[si], 2);
+	    //Astenc = stencil_shape_d[2*stencil_size+si];
+	    Astenc = hypre_StructGetIndexD(stencil_shape[si], 2,stencil_shape_d[2*stencil_size+si]);
             if (Astenc == 0)
             {
                lambdaz += Ap[Ai];
@@ -447,6 +455,8 @@ hypre_SparseMSGFilterSetup( hypre_StructMatrix *A,
          vzp[vi] = lambdaz / (lambdax + lambday + lambdaz);
       }
       hypre_BoxLoop2End(Ai, vi);
+
+      hypre_StructCleanIndexD();	  
    }
 
    return ierr;
@@ -472,9 +482,6 @@ hypre_SparseMSGFilter( hypre_StructVector *visit,
    hypre_Box             *e_dbox;
    hypre_Box             *v_dbox;
                         
-   HYPRE_Int              ei;
-   HYPRE_Int              vi;
-                        
    HYPRE_Real            *ep;
    HYPRE_Real            *vp;
                         
@@ -519,7 +526,7 @@ hypre_SparseMSGFilter( hypre_StructVector *visit,
                           e_dbox, start,  stride,  ei,
                           v_dbox, startv, stridev, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,vi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(ei, vi)
       {
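
For reference, the lambdax/lambday/lambdaz sums computed in the FilterSetup loop above add a stencil coefficient when its offset in that direction is zero and subtract it otherwise, then normalize into per-direction visit weights. A standalone 2-D analogue for a 5-point stencil (coefficients hypothetical):

    #include <stdio.h>

    int main(void)
    {
       /* stencil offsets (x,y) and coefficients at one grid point */
       int    shape[5][2] = {{0,0},{-1,0},{1,0},{0,-1},{0,1}};
       double a[5]        = {4.0, -1.0, -1.0, -1.0, -1.0};
       double lx = 0.0, ly = 0.0;
       for (int si = 0; si < 5; si++)
       {
          lx += (shape[si][0] == 0) ? a[si] : -a[si];  /* lambdax rule */
          ly += (shape[si][1] == 0) ? a[si] : -a[si];  /* lambday rule */
       }
       /* normalized weights, as in vxp[vi] = lambdax/(lambdax+lambday) */
       printf("lx=%g ly=%g wx=%g wy=%g\n", lx, ly, lx/(lx+ly), ly/(lx+ly));
       return 0;   /* prints lx=4 ly=4 wx=0.5 wy=0.5 for this stencil */
    }
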
diff --git a/src/struct_ls/sparse_msg_interp.c b/src/struct_ls/sparse_msg_interp.c
index e22b7fd..81a69a4 100644
--- a/src/struct_ls/sparse_msg_interp.c
+++ b/src/struct_ls/sparse_msg_interp.c
@@ -58,7 +58,7 @@ hypre_SparseMSGInterpSetup( void               *interp_vdata,
                             hypre_Index         stride,
                             hypre_Index         strideP       )
 {
-	hypre_SparseMSGInterpData   *interp_data = (hypre_SparseMSGInterpData   *)interp_vdata;
+   hypre_SparseMSGInterpData   *interp_data = (hypre_SparseMSGInterpData   *)interp_vdata;
 
    hypre_StructGrid       *grid;
    hypre_StructStencil    *stencil;
@@ -131,10 +131,6 @@ hypre_SparseMSGInterp( void               *interp_vdata,
    hypre_Box              *P_dbox;
    hypre_Box              *xc_dbox;
    hypre_Box              *e_dbox;
-                       
-   HYPRE_Int               Pi;
-   HYPRE_Int               xci;
-   HYPRE_Int               ei;
                          
    HYPRE_Real             *Pp0, *Pp1;
    HYPRE_Real             *xcp;
@@ -203,7 +199,7 @@ hypre_SparseMSGInterp( void               *interp_vdata,
                           e_dbox,  start,  stride,  ei,
                           xc_dbox, startc, stridec, xci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ei,xci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(ei, xci)
       {
@@ -263,7 +259,7 @@ hypre_SparseMSGInterp( void               *interp_vdata,
                                 P_dbox, startP, strideP, Pi,
                                 e_dbox, start,  stride,  ei);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Pi,ei) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(Pi, ei)
             {
diff --git a/src/struct_ls/sparse_msg_restrict.c b/src/struct_ls/sparse_msg_restrict.c
index aed3718..cad32dc 100644
--- a/src/struct_ls/sparse_msg_restrict.c
+++ b/src/struct_ls/sparse_msg_restrict.c
@@ -58,7 +58,7 @@ hypre_SparseMSGRestrictSetup( void               *restrict_vdata,
                               hypre_Index         stride,
                               hypre_Index         strideR         )
 {
-	hypre_SparseMSGRestrictData *restrict_data = (hypre_SparseMSGRestrictData *)restrict_vdata;
+   hypre_SparseMSGRestrictData *restrict_data = (hypre_SparseMSGRestrictData *)restrict_vdata;
 
    hypre_StructGrid       *grid;
    hypre_StructStencil    *stencil;
@@ -129,10 +129,6 @@ hypre_SparseMSGRestrict( void               *restrict_vdata,
    hypre_Box              *R_dbox;
    hypre_Box              *r_dbox;
    hypre_Box              *rc_dbox;
-                       
-   HYPRE_Int               Ri;
-   HYPRE_Int               ri;
-   HYPRE_Int               rci;
                          
    HYPRE_Real             *Rp0, *Rp1;
    HYPRE_Real             *rp, *rp0, *rp1;
@@ -231,7 +227,7 @@ hypre_SparseMSGRestrict( void               *restrict_vdata,
                                 r_dbox,  start,  stride,  ri,
                                 rc_dbox, startc, stridec, rci);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,Ri,ri,rci) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ri, ri, rci)
             {
diff --git a/src/struct_ls/sparse_msg_setup.c b/src/struct_ls/sparse_msg_setup.c
index 6a32a55..f12b1fb 100644
--- a/src/struct_ls/sparse_msg_setup.c
+++ b/src/struct_ls/sparse_msg_setup.c
@@ -10,8 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
 /******************************************************************************
  *
  *
@@ -24,23 +22,23 @@
 
 #define GRID 0
 
-#define hypre_SparseMSGSetCIndex(cdir, cindex) \
-{\
-   hypre_SetIndex3(cindex, 0, 0, 0);\
-   hypre_IndexD(cindex, cdir) = 0;\
-}
+#define hypre_SparseMSGSetCIndex(cdir, cindex)  \
+   {                                            \
+      hypre_SetIndex3(cindex, 0, 0, 0);         \
+      hypre_IndexD(cindex, cdir) = 0;           \
+   }
 
-#define hypre_SparseMSGSetFIndex(cdir, findex) \
-{\
-   hypre_SetIndex3(findex, 0, 0, 0);\
-   hypre_IndexD(findex, cdir) = 1;\
-}
+#define hypre_SparseMSGSetFIndex(cdir, findex)  \
+   {                                            \
+      hypre_SetIndex3(findex, 0, 0, 0);         \
+      hypre_IndexD(findex, cdir) = 1;           \
+   }
 
-#define hypre_SparseMSGSetStride(cdir, stride) \
-{\
-   hypre_SetIndex3(stride, 1, 1, 1);\
-   hypre_IndexD(stride, cdir) = 2;\
-}
+#define hypre_SparseMSGSetStride(cdir, stride)  \
+   {                                            \
+      hypre_SetIndex3(stride, 1, 1, 1);         \
+      hypre_IndexD(stride, cdir) = 2;           \
+   }
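
These three macros expand to plain brace blocks that set every index component and then override the coarsening direction cdir. A compilable usage sketch with the index type simplified to int[3] (the my_ names are stand-ins for the hypre macros):

    #include <stdio.h>

    #define my_SetIndex3(ix, i0, i1, i2) \
       { (ix)[0] = (i0); (ix)[1] = (i1); (ix)[2] = (i2); }
    #define my_IndexD(ix, d)  ((ix)[d])

    #define my_SetStride(cdir, stride)      \
       {                                    \
          my_SetIndex3(stride, 1, 1, 1);    \
          my_IndexD(stride, cdir) = 2;      \
       }

    int main(void)
    {
       int stride[3];
       my_SetStride(1, stride);  /* coarsen along y: stride becomes {1,2,1} */
       printf("%d %d %d\n", stride[0], stride[1], stride[2]);
       return 0;
    }

One design note: because the expansion is a bare { ... } block rather than do { ... } while (0), a trailing semicolon after an invocation is harmless in straight-line code, but the macro cannot be used as the unbraced body of an if/else.
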
 
 /*--------------------------------------------------------------------------
  * hypre_SparseMSGSetup
@@ -52,7 +50,7 @@ hypre_SparseMSGSetup( void               *smsg_vdata,
                       hypre_StructVector *b,
                       hypre_StructVector *x          )
 {
-	hypre_SparseMSGData  *smsg_data = (hypre_SparseMSGData  *)smsg_vdata;
+   hypre_SparseMSGData  *smsg_data = (hypre_SparseMSGData  *)smsg_vdata;
 
    MPI_Comm              comm = (smsg_data -> comm);
                      
@@ -559,7 +557,7 @@ hypre_SparseMSGSetup( void               *smsg_vdata,
       }
    }
 
-   data = hypre_SharedCTAlloc(HYPRE_Real, data_size);
+   data = hypre_DeviceCTAlloc(HYPRE_Real,data_size);
    (smsg_data -> data) = data;
 
    hypre_StructVectorInitializeData(t_a[0], data);
diff --git a/src/struct_mv/_hypre_struct_mv.h b/src/struct_mv/_hypre_struct_mv.h
index c1d06d5..8c807ab 100644
--- a/src/struct_mv/_hypre_struct_mv.h
+++ b/src/struct_mv/_hypre_struct_mv.h
@@ -1,3 +1,2125 @@
+
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
+
+#ifndef hypre_STRUCT_MV_HEADER
+#define hypre_STRUCT_MV_HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "HYPRE_struct_mv.h"
+#include "_hypre_utilities.h"
+
+#if defined(HYPRE_USE_RAJA)
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+extern "C++" {
+#include <RAJA/RAJA.hxx>
+}
+using namespace RAJA;
+
+typedef struct hypre_Boxloop_struct
+{
+	HYPRE_Int lsize0,lsize1,lsize2;
+	HYPRE_Int strides0,strides1,strides2;
+	HYPRE_Int bstart0,bstart1,bstart2;
+	HYPRE_Int bsize0,bsize1,bsize2;
+} hypre_Boxloop;
+
+#define BLOCKSIZE 256
+
+#if defined(HYPRE_MEMORY_GPU)
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+{
+    if (err)
+    {
+        printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+		HYPRE_Int *p = NULL; *p = 1;
+    }
+}
+
+#define hypre_exec_policy cuda_exec<BLOCKSIZE>
+#define hypre_reduce_policy  cuda_reduce_atomic<BLOCKSIZE>
+#define hypre_fence() \
+cudaError err = cudaGetLastError();\
+if ( cudaSuccess != err ) {\
+printf("\n ERROR zypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__); \
+}\
+AxCheckError(cudaDeviceSynchronize());
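
    /* For comparison with the hypre_fence()/AxCheckError pair above: the
       conventional CUDA error-check wrapper, wrapped in do { } while (0) so
       the macro behaves as a single statement.  MY_CUDA_CHECK is a
       hypothetical name, a common hardening of the same pattern rather than
       anything this header defines. */
    #include <cuda_runtime.h>
    #include <stdio.h>

    #define MY_CUDA_CHECK(call)                                        \
       do {                                                            \
          cudaError_t err_ = (call);                                   \
          if (err_ != cudaSuccess)                                     \
             printf("CUDA error %d: %s at %s:%d\n", (int)err_,         \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
       } while (0)

    /* usage: MY_CUDA_CHECK(cudaDeviceSynchronize()); */
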
+
+#elif defined(HYPRE_USE_OPENMP)
+   #define hypre_exec_policy      omp_for_exec
+   #define hypre_reduce_policy omp_reduce
+   #define hypre_fence() 
+#elif defined(HYPRE_USING_OPENMP_ACC)
+   #define hypre_exec_policy      omp_parallel_for_acc
+   #define hypre_reduce_policy omp_acc_reduce
+#else 
+   #define hypre_exec_policy   seq_exec
+   #define hypre_reduce_policy seq_reduce
+   #define hypre_fence()
+#endif
+
+#define zypre_BoxLoopIncK(k,box,i)					\
+{									\
+   HYPRE_Int idx = idx_local;						\
+   local_idx  = idx % box.lsize0;					\
+   idx        = idx / box.lsize0;					\
+   i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+   local_idx  = idx % box.lsize1;					\
+   idx        = idx / box.lsize1;					\
+   i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+   local_idx  = idx % box.lsize2;					\
+   idx  = idx / box.lsize2;					\
+   i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+}
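
    /* Worked, standalone version of the mod/div decomposition performed by
       zypre_BoxLoopIncK above: unflatten a linear loop index over an
       (lsize0 x lsize1 x lsize2) box into a strided data offset.  Function
       and variable names here are illustrative, not part of this header. */
    #include <stdio.h>

    static int unflatten_offset(int idx, const int lsize[3], const int strides[3],
                                const int bstart[3], const int bsize[3])
    {
       int i = 0, boxD = 1;
       for (int d = 0; d < 3; d++)
       {
          int local = idx % lsize[d];              /* local index in dim d   */
          idx /= lsize[d];                         /* peel off this dim      */
          i += (local * strides[d] + bstart[d]) * boxD;
          boxD *= (bsize[d] + 1 > 0) ? (bsize[d] + 1) : 0;  /* hypre_max(0,.) */
       }
       return i;
    }

    int main(void)
    {
       int lsize[3]  = {4, 3, 2}, strides[3] = {1, 1, 1};
       int bstart[3] = {0, 0, 0}, bsize[3]   = {3, 2, 1};  /* 4x3x2 data box */
       /* unit strides over a matching data box give the identity: prints 13 */
       printf("%d\n", unflatten_offset(13, lsize, strides, bstart, bsize));
       return 0;
    }
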
+
+
+#define zypre_BoxLoopCUDAInit(ndim,loop_size)				\
+  HYPRE_Int hypre__tot = 1;						\
+  for (HYPRE_Int i = 0;i < ndim;i ++)					\
+      hypre__tot *= loop_size[i];
+
+
+#define zypre_BoxLoopCUDADeclare()										\
+	HYPRE_Int local_idx;												\
+	HYPRE_Int idx_local = idx;
+
+#define zypre_newBoxLoop0Begin(ndim, loop_size)			\
+{									\
+   zypre_BoxLoopCUDAInit(ndim,loop_size);					\
+   forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+   {
+
+
+#define zypre_newBoxLoop0End()					\
+	});											\
+	hypre_fence();      \
+}
+
+#define zypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	hypre_Boxloop databox##k;					\
+	databox##k.lsize0 = loop_size[0];				\
+	databox##k.strides0 = stride[0];				\
+	databox##k.bstart0  = start[0] - dbox->imin[0];			\
+	databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	if (ndim > 1)							\
+	{								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];   	\
+	}								\
+	else						        	\
+	{							       	\
+		databox##k.lsize1 = 1;				       	\
+		databox##k.strides1 = 0;		       		\
+		databox##k.bstart1  = 0;	       			\
+		databox##k.bsize1   = 0;		       		\
+	}								\
+	if (ndim == 3)							\
+	{								\
+	    databox##k.lsize2 = loop_size[2];				\
+	    databox##k.strides2 = stride[2];				\
+	    databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	    databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];   	\
+	}								\
+	else								\
+	{								\
+	    databox##k.lsize2 = 1;					\
+	    databox##k.strides2 = 0;					\
+	    databox##k.bstart2  = 0;					\
+	    databox##k.bsize2   = 0;					\
+	}
+
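+/*
+ * In the declaration macro above, dimensions beyond ndim are padded with
+ * lsize = 1 and zero strides/offsets, so the three-level index arithmetic
+ * in the loops below is safe for 1D and 2D boxes as well.
+ */
+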
+#define zypre_newBoxLoop1Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1)		\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+    zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+      zypre_BoxLoopCUDADeclare();					\
+      HYPRE_Int hypre_boxD1 = 1;					\
+      HYPRE_Int i1 = 0;							\
+      zypre_BoxLoopIncK(1,databox1,i1);
+
+      
+#define zypre_newBoxLoop1End(i1)				\
+	});											\
+    hypre_fence();\
+}
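+
+/*
+ * Illustrative usage sketch (not part of this header; dbox, start, stride,
+ * loop_size and xp are hypothetical caller variables).  Through the
+ * hypre_BoxLoop1* aliases defined at the end of this section, a caller
+ * zeroes one box of data as:
+ *
+ *   hypre_BoxLoop1Begin(ndim, loop_size, dbox, start, stride, i1)
+ *   {
+ *      xp[i1] = 0.0;
+ *   }
+ *   hypre_BoxLoop1End(i1);
+ */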
+	
+#define zypre_newBoxLoop2Begin(ndim, loop_size,				\
+                                dbox1, start1, stride1, i1,	\
+                                dbox2, start2, stride2, i2)	\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+    zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+        HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+		HYPRE_Int i1 = 0, i2 = 0;							\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	
+
+
+#define zypre_newBoxLoop2End(i1, i2)			\
+	});											\
+    hypre_fence();\
+}
+
+#define zypre_newBoxLoop3Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3)		\
+  {									\
+  zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+        zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+        zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+        zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+        forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+	{								\
+	  zypre_BoxLoopCUDADeclare();					\
+	  HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	  HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	  local_idx  = idx_local % databox1.lsize0;				\
+	  idx_local  = idx_local / databox1.lsize0;				\
+	  i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	  i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	  i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	  local_idx  = idx_local % databox1.lsize1;				\
+	  idx_local  = idx_local / databox1.lsize1;				\
+	  i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	  i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	  i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	  local_idx  = idx_local % databox1.lsize2;				\
+	  idx_local  = idx_local / databox1.lsize2;				\
+	  i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	  i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	  i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);			\
+	  
+
+#define zypre_newBoxLoop3End(i1, i2, i3)			\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_newBoxLoop4Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3,		\
+			       dbox4, start4, stride4, i4)		\
+{								       \
+ zypre_BoxLoopCUDAInit(ndim,loop_size);					       \
+     zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+     zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+     zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+     zypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \
+     forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+     {									\
+         zypre_BoxLoopCUDADeclare();					\
+		 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	 i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	 i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	 i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	 i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+	 
+#define zypre_newBoxLoop4End(i1, i2, i3, i4)	\
+  });						\
+  hypre_fence();				\
+}
+
+#define MAX_BLOCK BLOCKSIZE
+
+extern "C++" {
+#if defined(HYPRE_MEMORY_GPU)
+template<class T>
+class ReduceMult   
+{
+public:
+  /*!
+   * \brief Constructor takes initial reduction value (default constructor
+   * is disabled).
+   *
+   * Note: Constructor only executes on the host.
+   */
+  explicit ReduceMult(T init_val)
+  {
+    m_is_copy_host = false;
+    m_myID = getCudaReductionId();
+    getCudaReductionTallyBlock(m_myID,
+                               (void **)&m_tally_host,
+                               (void **)&m_tally_device);
+    m_tally_host->tally = init_val;
+  }
+
+  /*!
+   * \brief Initialize shared memory on device, request shared memory on host.
+   *
+   * Copy constructor executes on both host and device.
+   * On host requests dynamic shared memory and gets offset into dynamic
+   * shared memory if in forall.
+   * On device initializes dynamic shared memory to appropriate value.
+   */
+  RAJA_HOST_DEVICE
+  ReduceMult(const ReduceMult<T> &other)
+  {
+    *this = other;
+#if defined(__CUDA_ARCH__)
+    m_is_copy_device = true;
+    m_finish_reduction = !other.m_is_copy_device;
+    extern __shared__ unsigned char sd_block[];
+    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+    HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                   + (blockDim.x * blockDim.y) * threadIdx.z;
+
+    // initialize shared memory
+    T val = static_cast<T>(0);
+    for (HYPRE_Int i = BLOCKSIZE / 2; i > 0; i /= 2) {
+      // this descends all the way to 1
+      if (threadId < i) {
+        sd[threadId + i] = val;
+      }
+    }
+    if (threadId < 1) {
+      sd[threadId] = val;
+    }
+
+    __syncthreads();
+#else
+    m_is_copy_host = true;
+    m_smem_offset = getCudaSharedmemOffset(m_myID, BLOCKSIZE, sizeof(T));
+#endif
+  }
+
+  /*!
+   * \brief Finish reduction on device and free memory on host.
+   *
+   * Destruction on host releases the device memory chunk for
+   * reduction id and id itself for others to use.
+   * Destruction on device completes the reduction.
+   *
+   * Note: destructor executes on both host and device.
+   */
+  RAJA_HOST_DEVICE ~ReduceMult<T>()
+  {
+#if defined(__CUDA_ARCH__)
+    if (m_finish_reduction) {
+      extern __shared__ unsigned char sd_block[];
+      T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+      HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                     + (blockDim.x * blockDim.y) * threadIdx.z;
+
+      T temp = 1;
+      __syncthreads();
+
+      for (HYPRE_Int i = BLOCKSIZE / 2; i >= WARP_SIZE; i /= 2) {
+        if (threadId < i) {
+          sd[threadId] *= sd[threadId + i];
+        }
+        __syncthreads();
+      }
+
+      if (threadId < WARP_SIZE) {
+        temp = sd[threadId];
+        for (HYPRE_Int i = WARP_SIZE / 2; i > 0; i /= 2) {
+          temp *= HIDDEN::shfl_xor<T>(temp, i);
+        }
+      }
+
+      // one thread adds to tally
+      if (threadId == 0) {
+        _atomicAdd<T>(&(m_tally_device->tally), temp);
+      }
+    }
+#else
+    if (!m_is_copy_host) {
+      releaseCudaReductionTallyBlock(m_myID);
+      releaseCudaReductionId(m_myID);
+    }
+#endif
+
+    
+  }
+
+  /*!
+   * \brief Conversion operator that returns the reduced product value.
+   *
+   * Note: accessor only executes on host.
+   */
+  operator T()
+  {
+    beforeCudaReadTallyBlock<true>(m_myID);
+    return m_tally_host->tally;
+  }
+
+  /*!
+   * \brief Method that returns the reduced product value.
+   *
+   * Note: accessor only executes on host.
+   */
+  T get() { return operator T(); }
+
+  /*!
+   * \brief Operator that multiplies a value into the thread's partial
+   * product.
+   *
+   * Note: only operates on device.
+   */
+  RAJA_DEVICE ReduceMult<T> const &
+  operator*=(T val) const
+  {
+    extern __shared__ unsigned char sd_block[];
+    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+    HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                   + (blockDim.x * blockDim.y) * threadIdx.z;
+
+    sd[threadId] *= val;
+
+    return *this;
+  }
+
+private:
+  /*!
+   * \brief Default constructor is declared private and not implemented.
+   */
+  ReduceMult<T>();
+
+  /*!
+   * \brief Pointer to host tally block cache slot for this reduction variable.
+   */
+  CudaReductionTallyTypeAtomic<T> *m_tally_host = nullptr;
+
+  /*!
+   * \brief Pointer to device tally block slot for this reduction variable.
+   */
+  CudaReductionTallyTypeAtomic<T> *m_tally_device = nullptr;
+
+  /*!
+   * \brief My cuda reduction variable ID.
+   */
+  HYPRE_Int m_myID = -1;
+
+  /*!
+   * \brief Byte offset into dynamic shared memory.
+   */
+  HYPRE_Int m_smem_offset = -1;
+
+  /*!
+   * \brief If this variable is a copy or not; only original may release memory 
+   *        or perform finalization.
+   */
+  bool m_is_copy_host = false;
+  bool m_is_copy_device = false;
+  bool m_finish_reduction = false;
+
+  // Sanity checks for block size and template type size
+  static constexpr bool powerOfTwoCheck = (!(BLOCKSIZE & (BLOCKSIZE - 1)));
+  static constexpr bool reasonableRangeCheck =
+      ((BLOCKSIZE >= 32) && (BLOCKSIZE <= 1024));
+  static constexpr bool sizeofcheck =
+      ((sizeof(T) <= sizeof(CudaReductionDummyDataType))
+       && (sizeof(CudaReductionTallyType<T>)
+           <= sizeof(CudaReductionDummyTallyType))
+       && (sizeof(CudaReductionBlockType<T>)
+           <= sizeof(CudaReductionDummyBlockType)));
+  static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2");
+  static_assert(reasonableRangeCheck,
+                "Error: block sizes must be between 32 and 1024");
+  static_assert(sizeofcheck,
+      "Error: type must be of size <= " 
+      RAJA_STRINGIFY_MACRO(RAJA_CUDA_REDUCE_VAR_MAXSIZE));
+};
+#elif defined(HYPRE_USE_OPENMP) /* same guard as the omp_for_exec policy above */
+    template <typename T>
+    class ReduceMult
+    {
+        using my_type = ReduceMult;
+        
+    public:
+        //
+        // Constructor takes default value (default ctor is disabled).
+        //
+        explicit ReduceMult(T init_val, T initializer = 1)
+        : m_parent(NULL), m_val(init_val), m_custom_init(initializer)
+        {
+        }
+        
+        //
+        // Copy ctor.
+        //
+        ReduceMult(const ReduceMult& other) :
+        m_parent(other.m_parent ? other.m_parent : &other),
+        m_val(other.m_custom_init),
+        m_custom_init(other.m_custom_init)
+        {
+        }
+        
+        //
+        // Destruction folds this copy's partial product into its parent
+        // under an OpenMP critical section.
+        //
+        ~ReduceMult()
+        {
+            if (m_parent) {
+#pragma omp critical
+                {
+                    *m_parent *= m_val;
+                }
+            }
+        }
+        
+        //
+        // Conversion operator that returns the reduced product value.
+        //
+        operator T()
+        {
+            return m_val;
+        }
+        
+        //
+        // Method that returns the reduced product value.
+        //
+        T get() { return operator T(); }
+        
+        //
+        // *= operator that multiplies a value into the current thread's
+        // partial product.
+        //
+        const ReduceMult& operator*=(T rhs) const
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+        ReduceMult& operator*=(T rhs)
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+    private:
+        //
+        // Default ctor is declared private and not implemented.
+        //
+        ReduceMult();
+        
+        const my_type * m_parent;
+        
+        mutable T m_val;
+        T m_custom_init;
+        
+    };
+#else
+    template <typename T>
+    class ReduceMult
+    {
+        using my_type = ReduceMult;
+        
+    public:
+        //
+        // Constructor takes default value (default ctor is disabled).
+        //
+        explicit ReduceMult(T init_m_val, T initializer = 1) :
+        m_parent(NULL),
+        m_val(init_m_val),
+        m_custom_init(initializer)
+        {
+        }
+        
+        //
+        // Copy ctor.
+        //
+        ReduceMult(const ReduceMult& other) :
+        m_parent(other.m_parent ? other.m_parent : &other),
+        m_val(other.m_custom_init),
+        m_custom_init(other.m_custom_init)
+        {
+        }
+        
+        //
+        // Destruction folds this copy's partial product into its parent.
+        //
+        ~ReduceMult()
+        {
+            if (m_parent) {
+                *m_parent *= m_val;
+            }
+        }
+        
+        //
+        // Conversion operator that returns the reduced product value.
+        //
+        operator T()
+        {
+            return m_val;
+        }
+        
+        //
+        // Method that returns the reduced product value.
+        //
+        T get() { return operator T(); }
+        
+        //
+        // *= operator that multiplies a value into the product.
+        //
+        ReduceMult& operator*=(T rhs)
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+        const ReduceMult& operator*=(T rhs) const
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+    private:
+        //
+        // Default ctor is declared private and not implemented.
+        //
+        ReduceMult();
+        
+        const my_type * m_parent;
+        
+        mutable T m_val;
+        T m_custom_init;
+    };
+#endif
+}
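+
+/*
+ * Minimal usage sketch for ReduceMult (it mirrors the ReductionMult macro
+ * below; prod, xp and i1 are hypothetical): construct on the host with the
+ * multiplicative identity, fold factors in from the loop body, then read
+ * the result back on the host.
+ *
+ *   ReduceMult<HYPRE_Real> prod(1.0);
+ *   ... inside a BoxLoop body:   prod *= xp[i1];
+ *   HYPRE_Real result = (HYPRE_Real) prod;
+ */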
+
+
+#define zypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,sum) \
+{									\
+   HYPRE_Real sum_tmp;							\
+   {									\
+      ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0);				\
+      zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1,i1)	\
+      {
+
+#define zypre_newBoxLoop1ReductionEnd(i1,sum)				\
+      }									\
+      zypre_newBoxLoop1End(i1);					\
+      hypre_fence();						\
+      sum_tmp = (HYPRE_Real)(sum);				\
+   }								\
+   sum += sum_tmp; \
+}
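+
+/*
+ * Sketch of how the reduction pair is driven (caller code hypothetical):
+ * the macros introduce a device-side ReduceSum that shadows the caller's
+ * accumulator `sum`, then fold the device result back into it.
+ *
+ *   HYPRE_Real sum = 0.0;
+ *   zypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox, start, stride, i1, sum)
+ *   {
+ *      sum += xp[i1] * xp[i1];
+ *   }
+ *   zypre_newBoxLoop1ReductionEnd(i1, sum)
+ */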
+		    
+#define zypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,sum)	\
+{									\
+   HYPRE_Real sum_tmp;							\
+   {									\
+      ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0);				\
+      zypre_newBoxLoop2Begin(ndim, loop_size, \
+			     dbox1, start1, stride1,i1,\
+			     dbox2, start2, stride2,i2)	\
+      {
+
+#define zypre_newBoxLoop2ReductionEnd(i1,i2,sum)			\
+      }									\
+      zypre_newBoxLoop2End(i1,i2);					\
+      hypre_fence();							\
+      sum_tmp = (HYPRE_Real)(sum);					\
+   }								\
+   sum += sum_tmp; \
+}
+
+#define zypre_newBoxLoop1ReductionMult(ndim, loop_size,				\
+				       dbox1, start1, stride1, i1,xp,sum) \
+{									\
+   ReduceMult<HYPRE_Real> local_result_raja(1.0);				\
+   zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \
+   {									\
+       local_result_raja *= xp[i1];					\
+   }									\
+   zypre_newBoxLoop1End(i1)						\
+   hypre_fence();							\
+   sum *= (HYPRE_Real)(local_result_raja);				\
+}
+
+
+#define hypre_LoopBegin(size,idx)					\
+{									\
+   forall< hypre_exec_policy >(0, size, [=] RAJA_DEVICE (HYPRE_Int idx)	\
+   {
+
+#define hypre_LoopEnd()					\
+   });							\
+   hypre_fence();		\
+}
+  
+#define zypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{									\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);				\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+        HYPRE_Int i1 = 0;							\
+        local_idx  = idx_local % databox1.lsize0;			\
+        idx_local  = idx_local / databox1.lsize0;			\
+        i1 += local_idx*databox1.strides0;				\
+        local_idx  = idx_local % databox1.lsize1;			\
+        idx_local  = idx_local / databox1.lsize1;			\
+        i1 += local_idx*databox1.strides1;				\
+        local_idx  = idx_local % databox1.lsize2;			\
+        idx_local  = idx_local / databox1.lsize2;			\
+        i1 += local_idx*databox1.strides2;				\
+		
+#define zypre_BoxBoundaryCopyEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);					\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];					\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	i2 += local_idx*databox2.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	i2 += local_idx*databox2.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+	i2 += local_idx*databox2.strides2;
+
+
+
+#define zypre_BoxDataExchangeEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_newBoxLoop0For()
+
+#define zypre_newBoxLoop1For(i1)
+
+#define zypre_newBoxLoop2For(i1, i2) 
+ 
+#define zypre_newBoxLoop3For(i1, i2, i3)
+
+#define zypre_newBoxLoop4For(i1, i2, i3, i4)
+
+#define zypre_newBoxLoopSetOneBlock()
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock zypre_newBoxLoopSetOneBlock
+#define hypre_BoxLoopBlock()       0
+#define hypre_BoxLoop0Begin      zypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        zypre_newBoxLoop0For
+#define hypre_BoxLoop0End        zypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      zypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        zypre_newBoxLoop1For
+#define hypre_BoxLoop1End        zypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      zypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        zypre_newBoxLoop2For
+#define hypre_BoxLoop2End        zypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      zypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        zypre_newBoxLoop3For
+#define hypre_BoxLoop3End        zypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      zypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        zypre_newBoxLoop4For
+#define hypre_BoxLoop4End        zypre_newBoxLoop4End
+
+#define hypre_newBoxLoop1ReductionBegin zypre_newBoxLoop1ReductionBegin
+#define hypre_newBoxLoop1ReductionEnd   zypre_newBoxLoop1ReductionEnd
+#define hypre_newBoxLoop2ReductionBegin zypre_newBoxLoop2ReductionBegin
+#define hypre_newBoxLoop2ReductionEnd   zypre_newBoxLoop2ReductionEnd
+#define hypre_newBoxLoop1ReductionMult zypre_newBoxLoop1ReductionMult
+#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin
+#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd
+#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin
+#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd
+#endif
+#elif defined(HYPRE_USE_KOKKOS)
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+extern "C++" {
+#include <Kokkos_Core.hpp>
+}
+#if defined( KOKKOS_HAVE_MPI )
+#include <mpi.h>
+#endif
+
+ typedef struct hypre_Boxloop_struct
+ {
+	 HYPRE_Int lsize0,lsize1,lsize2;
+	 HYPRE_Int strides0,strides1,strides2;
+	 HYPRE_Int bstart0,bstart1,bstart2;
+	 HYPRE_Int bsize0,bsize1,bsize2;
+ } hypre_Boxloop;
+
+ #if defined(HYPRE_MEMORY_GPU)
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+ #define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+ inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+ {
+     if (err)
+     {
+	 printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+     }
+ }
+ #define BLOCKSIZE 256
+
+ #define hypre_fence()                                                  \
+ {                                                                      \
+    cudaError_t err = cudaGetLastError();                               \
+    if ( cudaSuccess != err )                                           \
+    {                                                                   \
+       printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",  \
+              cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__);  \
+    }                                                                   \
+    AxCheckError(cudaDeviceSynchronize());                              \
+ }
+ #elif defined(HYPRE_USE_OPENMP)
+    #define hypre_fence()
+ #elif defined(HYPRE_USING_OPENMP_ACC)
+    #define hypre_fence()
+ #else
+    #define hypre_fence()
+ #endif
+
+ #define hypre_newBoxLoopInit(ndim,loop_size)					\
+	 HYPRE_Int hypre__tot = 1;											\
+	 for (HYPRE_Int i = 0;i < ndim;i ++)									\
+		 hypre__tot *= loop_size[i];
+
+
+ #define hypre_BoxLoopIncK(k,box,i)					\
+ {									\
+    HYPRE_Int idx = idx_local;						\
+    local_idx  = idx % box.lsize0;					\
+    idx        = idx / box.lsize0;					\
+    i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+    local_idx  = idx % box.lsize1;					\
+    idx        = idx / box.lsize1;					\
+    i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+    local_idx  = idx % box.lsize2;					\
+    idx  = idx / box.lsize2;					\
+    i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+ }
+
+ #define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	 hypre_Boxloop databox##k;     					\
+	 databox##k.lsize0 = loop_size[0];				\
+	 databox##k.strides0 = stride[0];				\
+	 databox##k.bstart0  = start[0] - dbox->imin[0];		\
+	 databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	 if (ndim > 1)							\
+	 {								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];   	\
+	 }								\
+	 else						        	\
+	 {							       	\
+	    databox##k.lsize1 = 1;				       	\
+	    databox##k.strides1 = 0;					\
+	    databox##k.bstart1  = 0;					\
+	    databox##k.bsize1   = 0;					\
+	 }								\
+	 if (ndim == 3)							\
+	 {								\
+	    databox##k.lsize2 = loop_size[2];				\
+	    databox##k.strides2 = stride[2];				\
+	    databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	    databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];	\
+	 }								\
+	 else								\
+	 {								\
+	   databox##k.lsize2 = 1;					\
+	   databox##k.strides2 = 0;					\
+	   databox##k.bstart2  = 0;					\
+	   databox##k.bsize2   = 0;					\
+	 }
+
+ #define hypre_newBoxLoopDeclare()										\
+	 HYPRE_Int local_idx;												\
+	 HYPRE_Int idx_local = idx;
+
+ #define hypre_newBoxLoop0Begin(ndim, loop_size) 	\
+ {									\
+     hypre_newBoxLoopInit(ndim,loop_size);					\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {
+
+
+ #define hypre_newBoxLoop0End()					\
+	 });											\
+	 hypre_fence();							\
+ }
+
+
+ #define hypre_newBoxLoop1Begin(ndim, loop_size,				\
+				dbox1, start1, stride1, i1)		\
+ {									\
+     hypre_newBoxLoopInit(ndim,loop_size)						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+       hypre_newBoxLoopDeclare();						\
+       HYPRE_Int hypre_boxD1 = 1;					\
+       HYPRE_Int i1 = 0;							\
+       local_idx  = idx_local % databox1.lsize0;				\
+       idx_local  = idx_local / databox1.lsize0;				\
+       i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+       local_idx  = idx_local % databox1.lsize1;				\
+       idx_local  = idx_local / databox1.lsize1;				\
+       i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+       local_idx  = idx_local % databox1.lsize2;				\
+       idx_local  = idx_local / databox1.lsize2;				\
+       i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);
+
+
+ #define hypre_newBoxLoop1End(i1)					\
+     });									\
+     hypre_fence();							\
+ }
+
+
+ #define hypre_newBoxLoop2Begin(ndim, loop_size,				\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2)		\
+ {    														\
+     hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	 HYPRE_Int i1 = 0, i2 = 0;					\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+ #define hypre_newBoxLoop2End(i1, i2)			\
+      });							\
+      hypre_fence();						\
+ }
+
+
+ #define hypre_newBoxLoop3Begin(ndim, loop_size,\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2,		\
+				dbox3, start3, stride3, i3)		\
+ {																	\
+  hypre_newBoxLoopInit(ndim,loop_size);						\
+      hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+      hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+      hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+      Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+      {									\
+	 hypre_newBoxLoopDeclare();					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	 local_idx  = idx_local % databox1.lsize0;				\
+	 idx_local  = idx_local / databox1.lsize0;				\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	 local_idx  = idx_local % databox1.lsize1;				\
+	 idx_local  = idx_local / databox1.lsize1;				\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	 local_idx  = idx_local % databox1.lsize2;				\
+	 idx_local  = idx_local / databox1.lsize2;				\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	 i3 += (local_idx*databox3.strides2 +databox3.bstart2) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);
+
+ #define hypre_newBoxLoop3End(i1, i2, i3)			\
+     });							\
+     hypre_fence();					\
+ }
+
+ #define hypre_newBoxLoop4Begin(ndim, loop_size,\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2,		\
+				dbox3, start3, stride3, i3,		\
+				dbox4, start4, stride4, i4)		\
+ {									\
+  hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+     hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+	 hypre_newBoxLoopDeclare();					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	 i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	 i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	 i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	 i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+
+
+ #define hypre_newBoxLoop4End(i1, i2, i3, i4)		\
+     });							\
+     hypre_fence();					\
+ }
+
+ #define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					 dbox1, start1, stride1, i1, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);					\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1;					\
+	 HYPRE_Int i1 = 0;						\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+
+
+
+ #define hypre_newBoxLoop1ReductionEnd(i1, sum)				\
+     },sum);								\
+     hypre_fence();							\
+     sum += sum_tmp;							\
+ }
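+
+ /*
+  * Kokkos counterpart of the reduction sketch given earlier: the caller's
+  * running total is parked in sum_tmp, Kokkos::parallel_reduce
+  * re-accumulates into `sum`, and the saved total is added back at the end
+  * (caller code hypothetical):
+  *
+  *   HYPRE_Real sum = 0.0;
+  *   hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox, start, stride, i1, sum)
+  *   {
+  *      sum += xp[i1];
+  *   }
+  *   hypre_newBoxLoop1ReductionEnd(i1, sum)
+  */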
+
+ #define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					 dbox1, start1, stride1, i1,	\
+					 dbox2, start2, stride2, i2, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 0.0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);				\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	 HYPRE_Int i1 = 0, i2 = 0;					\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+
+ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)			\
+     },sum);								\
+     hypre_fence();							\
+     sum +=sum_tmp;							\
+ }
+
+ #define hypre_newBoxLoop1ReductionMult(ndim, loop_size,		\
+					dbox1, start1, stride1, i1, xp, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 1.0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1;					\
+	 HYPRE_Int i1 = 0;						\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 sum *= xp[i1];							\
+     },sum);								\
+     hypre_fence();							\
+     sum *=sum_tmp;								\
+}
+
+
+#define hypre_LoopBegin(size,idx)					\
+{    														\
+    Kokkos::parallel_for(size, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {
+
+#define hypre_LoopEnd()							\
+    });									\
+    hypre_fence();							\
+}
+  
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+    {									\
+       hypre__tot *= loop_size[d];					\
+    }									\
+    Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {									\
+        hypre_newBoxLoopDeclare()					\
+        HYPRE_Int i1 = 0;						\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+		
+#define hypre_BoxBoundaryCopyEnd()				\
+	});							\
+	hypre_fence();						\
+}
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];					\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+      {									\
+	hypre__tot *= loop_size[d];					\
+      }									\
+    Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {									\
+        hypre_newBoxLoopDeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	i2 += local_idx*databox2.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	i2 += local_idx*databox2.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+	i2 += local_idx*databox2.strides2;
+
+
+
+#define hypre_BoxDataExchangeEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define hypre_newBoxLoop0For()
+
+#define hypre_newBoxLoop1For(i1)
+
+#define hypre_newBoxLoop2For(i1, i2)
+
+#define hypre_newBoxLoop3For(i1, i2, i3)
+
+#define hypre_newBoxLoop4For(i1, i2, i3, i4)
+ 
+#define hypre_newBoxLoopSetOneBlock() {}
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock hypre_newBoxLoopSetOneBlock
+#define hypre_BoxLoopBlock()       0
+#define hypre_BoxLoop0Begin      hypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        hypre_newBoxLoop0For
+#define hypre_BoxLoop0End        hypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      hypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        hypre_newBoxLoop1For
+#define hypre_BoxLoop1End        hypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      hypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        hypre_newBoxLoop2For
+#define hypre_BoxLoop2End        hypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      hypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        hypre_newBoxLoop3For
+#define hypre_BoxLoop3End        hypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      hypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        hypre_newBoxLoop4For
+#define hypre_BoxLoop4End        hypre_newBoxLoop4End
+
+//#define hypre_newBoxLoop1ReductionBegin hypre_newBoxLoop1ReductionBegin
+//#define hypre_newBoxLoop1ReductionEnd   hypre_newBoxLoop1ReductionEnd
+//#define hypre_newBoxLoop2ReductionBegin hypre_newBoxLoop2ReductionBegin
+//#define hypre_newBoxLoop2ReductionEnd   hypre_newBoxLoop2ReductionEnd
+//#define hypre_newBoxLoop1ReductionMult hypre_newBoxLoop1ReductionMult
+//#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin
+//#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd
+//#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin
+//#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd
+
+#endif
+#elif defined(HYPRE_USE_CUDA)
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+struct cuda_traversal {HYPRE_Int cuda;};
+struct omp_traversal  {HYPRE_Int omp;};
+#define hypre_exec_policy cuda_traversal()
+#define HYPER_LAMBDA [=] __device__
+
+typedef struct hypre_Boxloop_struct
+{
+	HYPRE_Int lsize0,lsize1,lsize2;
+	HYPRE_Int strides0,strides1,strides2;
+	HYPRE_Int bstart0,bstart1,bstart2;
+	HYPRE_Int bsize0,bsize1,bsize2;
+} hypre_Boxloop;
+
+#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+{
+    if (err)
+    {
+        printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+    }
+}
+#define BLOCKSIZE 128
+
+#define hypre_fence()                                                   \
+{                                                                       \
+   cudaError_t err = cudaGetLastError();                                \
+   if ( cudaSuccess != err )                                            \
+   {                                                                    \
+      printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",   \
+             cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__);   \
+   }                                                                    \
+   AxCheckError(cudaDeviceSynchronize());                               \
+}
+
+extern "C++" {
+template <typename LOOP_BODY>
+__global__ void forall_kernel(LOOP_BODY loop_body, HYPRE_Int length)
+{
+	HYPRE_Int idx = blockDim.x * blockIdx.x + threadIdx.x;
+	if (idx < length)
+		loop_body(idx);
+}
+
+template<typename LOOP_BODY>
+void BoxLoopforall (cuda_traversal, HYPRE_Int length, LOOP_BODY loop_body)
+{
+	size_t const blockSize = BLOCKSIZE;
+	size_t gridSize  = (length + blockSize - 1) / blockSize;
+	if (gridSize == 0) gridSize = 1;
+	
+	//hypre_printf("length= %d, blocksize = %d, gridsize = %d\n",length,blockSize,gridSize);
+	forall_kernel<<<gridSize, blockSize>>>(loop_body,length);
+}
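+
+/*
+ * Launch-shape sketch for the kernel wrapper above: with blockSize = 128,
+ * a loop of length 1000 yields gridSize = (1000 + 127) / 128 = 8 blocks;
+ * the 8*128 - 1000 = 24 tail threads are masked off by the idx < length
+ * guard in forall_kernel.
+ */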
+
+template<typename LOOP_BODY>
+void BoxLoopforall (omp_traversal, HYPRE_Int length, LOOP_BODY loop_body)
+{
+
+#pragma omp parallel for schedule(static)
+	for (HYPRE_Int idx = 0;idx < length;idx++)
+		loop_body(idx);
+}
+
+#define zypre_BoxLoopIncK(k,box,i)					\
+{       								\
+HYPRE_Int idx = idx_local;						\
+local_idx  = idx % box.lsize0;					\
+idx        = idx / box.lsize0;					\
+i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+local_idx  = idx % box.lsize1;					\
+idx        = idx / box.lsize1;					\
+i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+local_idx  = idx % box.lsize2;					\
+idx  = idx / box.lsize2;					\
+i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+}
+
+
+template<class T>
+__global__ void reduction_mult (T * a, T * b, HYPRE_Int hypre__tot,
+				hypre_Boxloop box1)
+{
+    HYPRE_Int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+    HYPRE_Int local_idx;
+    HYPRE_Int idx_local = id;
+    HYPRE_Int hypre_boxD1 = 1;
+    HYPRE_Int i1 = 0;
+    /* per-block reduction scratchpad and this thread's partial product */
+    __shared__ T shared_cache [BLOCKSIZE];
+    T sum = 1;
+    local_idx  = idx_local % box1.lsize0;
+    idx_local  = idx_local / box1.lsize0;
+    i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+    local_idx  = idx_local % box1.lsize1;
+    idx_local  = idx_local / box1.lsize1;
+    i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);	
+    local_idx  = idx_local % box1.lsize2;	      
+    idx_local  = idx_local / box1.lsize2;		      
+    i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+    if (id < hypre__tot)
+      sum = a[i1];
+    *(shared_cache + threadIdx.x) = sum;
+    
+    __syncthreads();
+    
+    /* tree reduction: multiply pairs within the shared cache */
+    
+    HYPRE_Int i;    
+    
+    for (i=(BLOCKSIZE /2); i>0 ; i= i/2){
+      if (threadIdx.x < i){
+	*(shared_cache + threadIdx.x) *= *(shared_cache + threadIdx.x + i);
+      }
+      __syncthreads();
+    }
+    
+    if ( threadIdx.x == 0){
+      *(b+ blockIdx.x) = shared_cache[0];
+    }
+}
+}
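+
+/*
+ * Host-side driver sketch for reduction_mult (hypothetical; no such driver
+ * appears in this header): each block writes one partial product into b,
+ * and the host finishes the product over the nblocks entries.
+ *
+ *   HYPRE_Int nblocks = (hypre__tot + BLOCKSIZE - 1) / BLOCKSIZE;
+ *   reduction_mult<HYPRE_Real><<<nblocks, BLOCKSIZE>>>(a, b, hypre__tot, box1);
+ *   cudaDeviceSynchronize();
+ *   for (HYPRE_Int j = 0; j < nblocks; j++) prod *= b[j];
+ */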
+
+#define hypre_BoxLoopInit(ndim,loop_size)					\
+	HYPRE_Int hypre__tot = 1;											\
+	for (HYPRE_Int i = 0;i < ndim;i ++)									\
+		hypre__tot *= loop_size[i];
+
+
+#define hypre_newBoxLoopDeclare()\
+	HYPRE_Int hypre__i,hypre__j,hypre__k;\
+	HYPRE_Int idx_local = idx;
+
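+/* hypre__i/j/k, declared by hypre_newBoxLoopDeclare above, record the
+   per-dimension indices of the current iteration; the CUDA loops below
+   assign them so that index-recovery macros in the style of
+   hypre_newBoxLoopGetIndex can read them from the loop body. */
+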
+#define hypre_newBoxLoop0Begin(ndim, loop_size)				\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {
+
+#define hypre_newBoxLoop0End()					\
+    });									\
+    hypre_fence();							\
+}
+
+#define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	hypre_Boxloop databox##k;											\
+	databox##k.lsize0 = loop_size[0];				\
+	databox##k.strides0 = stride[0];				\
+	databox##k.bstart0  = start[0] - dbox->imin[0];			\
+	databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	if (ndim > 1)							\
+	{								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];	\
+	}								\
+	else						        	\
+	{							       	\
+		databox##k.lsize1 = 1;				       	\
+		databox##k.strides1 = 0;		       		\
+		databox##k.bstart1  = 0;	       			\
+		databox##k.bsize1   = 0;		       		\
+	}								\
+	if (ndim == 3)							\
+	{							      	\
+	      databox##k.lsize2 = loop_size[2];				\
+	      databox##k.strides2 = stride[2];				\
+	      databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	      databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];	\
+	}				                        	\
+	else						        	\
+	{							       	\
+		databox##k.lsize2 = 1;				       	\
+		databox##k.strides2 = 0;		       		\
+		databox##k.bstart2  = 0;	       			\
+		databox##k.bsize2   = 0;		       		\
+	}
+
+#define hypre_newBoxLoop1Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);				\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+      hypre_newBoxLoopDeclare();					\
+      HYPRE_Int hypre_boxD1 = 1;					\
+      HYPRE_Int i1 = 0;							\
+      hypre__i  = idx_local % databox1.lsize0;				\
+      idx_local = idx_local / databox1.lsize0;				\
+      i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+      hypre__j  = idx_local % databox1.lsize1;				\
+      idx_local = idx_local / databox1.lsize1;				\
+      i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+      hypre__k  = idx_local % databox1.lsize2;				\
+      idx_local = idx_local / databox1.lsize2;				\
+      i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);
+      
+#define hypre_newBoxLoop1End(i1)				\
+    });									\
+    hypre_fence();							\
+}
+	
+#define hypre_newBoxLoop2Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+        hypre_newBoxLoopDeclare()					\
+        HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+#define hypre_newBoxLoop2End(i1, i2)			\
+    });							\
+    hypre_fence();					\
+}
+
+#define hypre_newBoxLoop3Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+	hypre_newBoxLoopDeclare();					\
+	HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	hypre__i  = idx_local % databox1.lsize0;				\
+	idx_local  = idx_local / databox1.lsize0;				\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	hypre__j   = idx_local % databox1.lsize1;				\
+	idx_local  = idx_local / databox1.lsize1;				\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	hypre__k  = idx_local % databox1.lsize2;				\
+	idx_local  = idx_local / databox1.lsize2;				\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	i3 += (hypre__k*databox3.strides2 + databox3.bstart2) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);			\
+	
+
+#define hypre_newBoxLoop3End(i1, i2,i3)			\
+    });									\
+    hypre_fence();							\
+}
+
+#define hypre_newBoxLoop4Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3,		\
+			       dbox4, start4, stride4, i4)		\
+{								       \
+     hypre_BoxLoopInit(ndim,loop_size);			       \
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+     hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+     hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \
+     BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+     {									\
+        hypre_newBoxLoopDeclare();					\
+	HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	i4 += (hypre__i*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	i4 += (hypre__j*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	i3 += (hypre__k*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	i4 += (hypre__k*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+		
+#define hypre_newBoxLoop4End(i1, i2, i3, i4)	\
+    });						\
+    hypre_fence();				\
+}
+
+#define MAX_BLOCK 512
+
+extern "C++" {
+template<class T>
+__inline__ __device__
+T fake_shfl_down(T val, HYPRE_Int offset, HYPRE_Int width=32) {
+  /* shared-memory stand-in for __shfl_down on devices without warp
+     shuffles; returns T so floating-point values are not truncated */
+  static __shared__ T shared[MAX_BLOCK];
+  HYPRE_Int lane=threadIdx.x%32;
+
+  shared[threadIdx.x]=val;
+  __syncthreads();
+
+  val = (lane+offset<width) ? shared[threadIdx.x+offset] : 0;
+  __syncthreads();
+
+  return val;
+}
+
+template<class T>  
+__inline__ __device__
+T warpReduceSum(T val) {
+  for (HYPRE_Int offset = warpSize/2; offset > 0; offset /= 2)
+    val += __shfl_down(val,offset);
+  return val;
+}
+
+
+template<class T> 
+__inline__ __device__
+T blockReduceSum(T val) {
+  static __shared__ T shared[32];
+  HYPRE_Int lane=threadIdx.x%warpSize;
+  HYPRE_Int wid=threadIdx.x/warpSize;
+  val=warpReduceSum<T>(val);
+
+  // write each warp's reduced value to shared memory
+  if(lane==0) shared[wid]=val;
+  __syncthreads();
+
+  // ensure we only grab a value from shared memory if that warp existed
+  val = (threadIdx.x<blockDim.x/warpSize) ? shared[lane] : T(0);
+  if(wid==0) val=warpReduceSum<T>(val);
+
+  return val;
+}
+
+template<class T>
+__global__ void hypre_device_reduce_stable_kernel(T*a, T*b, T* out, HYPRE_Int N,
+						  hypre_Boxloop box1,hypre_Boxloop box2) {
+  HYPRE_Int local_idx;
+  HYPRE_Int idx_local;
+  HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;
+  HYPRE_Int i1 = 0, i2 = 0;
+  T sum=T(0);
+  HYPRE_Int i;
+  
+  for(i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x)
+  {
+    idx_local = i;
+    local_idx  = idx_local % box1.lsize0;
+    idx_local  = idx_local / box1.lsize0;
+    i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+    i2 += (local_idx*box2.strides0 + box2.bstart0) * hypre_boxD2;
+    hypre_boxD2 *= hypre_max(0, box2.bsize0 + 1);
+    local_idx  = idx_local % box1.lsize1;
+    idx_local  = idx_local / box1.lsize1;
+    i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);
+    i2 += (local_idx*box2.strides1 + box2.bstart1) * hypre_boxD2;   
+    hypre_boxD2 *= hypre_max(0, box2.bsize1 + 1);	
+    local_idx  = idx_local % box1.lsize2;	      
+    idx_local  = idx_local / box1.lsize2;		      
+    i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+    i2 += (local_idx*box2.strides2 + box2.bstart2) * hypre_boxD2;
+    hypre_boxD2 *= hypre_max(0, box2.bsize2 + 1);
+    sum += a[i1] * hypre_conj(b[i2]);
+  }
+  sum=blockReduceSum<T>(sum);
+  if(threadIdx.x==0)
+    out[blockIdx.x]=sum;
+}
+
+template<class T>       
+__global__ void hypre_device_reduce_stable_kernel2(T *in, T* out, HYPRE_Int N) {
+  T sum=T(0);
+  for(HYPRE_Int i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x) {
+    sum+=in[i];
+  }
+  sum=blockReduceSum<T>(sum);
+  if(threadIdx.x==0)
+    out[blockIdx.x]=sum;
+}
+
+template<class T>   
+void hypre_device_reduce_stable(T*a,T*b, T* out, HYPRE_Int N,
+				hypre_Boxloop box1,hypre_Boxloop box2) {
+  HYPRE_Int threads=512;
+  HYPRE_Int blocks=min((N+threads-1)/threads,1024);
+
+  hypre_device_reduce_stable_kernel<T><<<blocks,threads>>>(a,b,out,N,box1,box2);
+  hypre_device_reduce_stable_kernel2<T><<<1,1024>>>(out,out,blocks); 
+}
+
+}
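
A host-side sketch of driving the two-pass reduction above; d_x and d_y are assumed device pointers and box1/box2 filled hypre_Boxloop descriptors:

    HYPRE_Real *d_out, result;
    cudaMalloc((void**)&d_out, 1024 * sizeof(HYPRE_Real)); /* >= max block count */
    hypre_device_reduce_stable<HYPRE_Real>(d_x, d_y, d_out, n, box1, box2);
    cudaMemcpy(&result, d_out, sizeof(HYPRE_Real), cudaMemcpyDeviceToHost);
    cudaFree(d_out);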
+
+extern "C++" {
+template <typename LOOP_BODY>
+__global__ void hypre_device_reduction_kernel(HYPRE_Real* out,
+					      HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2,
+					      LOOP_BODY loop_body)
+{
+    HYPRE_Int local_idx;
+    HYPRE_Int idx_local;
+    HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;
+    HYPRE_Int i1 = 0, i2 = 0;
+    HYPRE_Real sum = HYPRE_Real(0);
+    HYPRE_Int i;
+    
+    for(i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x)
+      {
+	idx_local = i;
+	local_idx  = idx_local % box1.lsize0;
+	idx_local  = idx_local / box1.lsize0;
+	i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+	i2 += (local_idx*box2.strides0 + box2.bstart0) * hypre_boxD2;
+	hypre_boxD2 *= hypre_max(0, box2.bsize0 + 1);
+	local_idx  = idx_local % box1.lsize1;
+	idx_local  = idx_local / box1.lsize1;
+	i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);
+	i2 += (local_idx*box2.strides1 + box2.bstart1) * hypre_boxD2;   
+	hypre_boxD2 *= hypre_max(0, box2.bsize1 + 1);	
+	local_idx  = idx_local % box1.lsize2;	      
+	idx_local  = idx_local / box1.lsize2;		      
+	i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+	i2 += (local_idx*box2.strides2 + box2.bstart2) * hypre_boxD2;
+	hypre_boxD2 *= hypre_max(0, box2.bsize2 + 1);
+	sum = loop_body(i1,i2,sum);
+      }
+    sum=blockReduceSum<HYPRE_Real>(sum);
+    if(threadIdx.x==0)
+      out[blockIdx.x]=sum;
+}
+
+template<typename LOOP_BODY>
+void hypre_device_reduction (HYPRE_Real* out,
+			     HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2,
+			     LOOP_BODY loop_body)
+{	
+  HYPRE_Int threads=512;
+  HYPRE_Int blocks=min((N+threads-1)/threads,1024);
+
+  hypre_device_reduction_kernel<<<blocks,threads>>>(out,N,box1,box2,loop_body);
+  hypre_device_reduce_stable_kernel2<HYPRE_Real><<<1,1024>>>(out,out,blocks);
+
+}
+}
+
+#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1, sum) \
+{    									   \
+   HYPRE_Real sum_old = sum;						\
+   sum = 0.0;								\
+   hypre_BoxLoopInit(ndim,loop_size);					\
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+   HYPRE_Real *d_c;							\
+   cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real));		\
+   hypre_device_reduction(d_c,hypre__tot,databox1,databox1,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \
+   {
+
+#define hypre_newBoxLoop1ReductionEnd(i1, sum)			\
+       return sum;								\
+   });									\
+  cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost);	\
+  sum += sum_old;							\
+  cudaFree(d_c);							\
+}
+
+#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,sum) \
+{    									   \
+   HYPRE_Real sum_old = sum;						\
+   sum = 0.0;								\
+   hypre_BoxLoopInit(ndim,loop_size);					\
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+   hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+   HYPRE_Real *d_c;							\
+   cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real));		\
+   hypre_device_reduction(d_c,hypre__tot,databox1,databox2,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \
+   {
+
+#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)			\
+      return sum;								\
+   });									\
+  cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost);	\
+  sum += sum_old;							\
+  cudaFree(d_c);							\
+}
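
Usage sketch for the 2-box reduction pair, modeled on hypre's struct inner product (xp, yp, the data boxes, start, and unit_stride are assumed set up by the caller):

    HYPRE_Real local_result = 0.0;
    hypre_newBoxLoop2ReductionBegin(ndim, loop_size,
                                    x_data_box, start, unit_stride, xi,
                                    y_data_box, start, unit_stride, yi,
                                    local_result);
    {
       local_result += xp[xi] * hypre_conj(yp[yi]);
    }
    hypre_newBoxLoop2ReductionEnd(xi, yi, local_result);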
+
+
+
+#define hypre_newBoxLoop1ReductionMult(ndim, loop_size,			  \
+				       dbox1, start1, stride1, i1,xp,sum) \
+{    									  \
+   HYPRE_Real sum_old = sum;\
+   sum = 1.0;\
+   hypre_BoxLoopInit(ndim,loop_size);				  \
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	  \
+   HYPRE_Int n_blocks = (hypre__tot+BLOCKSIZE-1)/BLOCKSIZE;		  \
+   HYPRE_Real *d_b;				        		  \
+   HYPRE_Real * b = new HYPRE_Real[n_blocks];				  \
+   cudaMalloc((void**) &d_b, n_blocks * sizeof(HYPRE_Real));			\
+   reduction_mult<HYPRE_Real><<< n_blocks, BLOCKSIZE >>>(xp,d_b,hypre__tot,databox1); \
+   hypre_fence();							\
+   /* copy the per-block partial products back to the host */		\
+   cudaMemcpy(b,d_b,n_blocks*sizeof(HYPRE_Real),cudaMemcpyDeviceToHost); \
+   for (HYPRE_Int j = 0; j < n_blocks; ++j){				\
+     sum *= b[j];							\
+   }									\
+   delete [] b;							\
+   cudaFree(d_b);							\
+   sum *= sum_old;\
+}
+
+#define hypre_LoopBegin(size,idx)					\
+{    														\
+	BoxLoopforall(hypre_exec_policy,size,HYPER_LAMBDA (HYPRE_Int idx) \
+	{
+
+#define hypre_LoopEnd()					\
+	});											\
+        hypre_fence();\
+}
+
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+    {									\
+	hypre__tot *= loop_size[d];					\
+    }									\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+	    hypre_newBoxLoopDeclare()											\
+	    HYPRE_Int i1 = 0;											\
+	    hypre__i  = idx_local % databox1.lsize0;			\
+	    idx_local  = idx_local / databox1.lsize0;			\
+	    i1 += hypre__i*databox1.strides0;				\
+	    hypre__j  = idx_local % databox1.lsize1;			\
+	    idx_local  = idx_local / databox1.lsize1;			\
+	    i1 += hypre__j*databox1.strides1;				\
+	    hypre__k  = idx_local % databox1.lsize2;			\
+	    idx_local  = idx_local / databox1.lsize2;			\
+	    i1 += hypre__k*databox1.strides2;				\
+		
+#define hypre_BoxBoundaryCopyEnd()				\
+    });									\
+    hypre_fence();							\
+}
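
Usage sketch for the boundary-copy loop (vp and the boundary-box sizes assumed from the caller, as when clearing boundary ghost values):

    hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride, vi, idx);
    {
       vp[vi] = 0.0;
    }
    hypre_BoxBoundaryCopyEnd();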
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];									\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];									\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+      {									\
+	hypre__tot *= loop_size[d];					\
+      }									\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+        hypre_newBoxLoopDeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += hypre__i*databox1.strides0;				\
+	i2 += hypre__i*databox2.strides0;				\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += hypre__j*databox1.strides1;				\
+	i2 += hypre__j*databox2.strides1;				\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += hypre__k*databox1.strides2;				\
+	i2 += hypre__k*databox2.strides2;
+
+
+#define hypre_BoxDataExchangeEnd()				\
+     });								\
+     hypre_fence();							\
+}
+  
+#define hypre_newBoxLoop0For()
+
+#define hypre_newBoxLoop1For(i1)
+
+#define hypre_newBoxLoop2For(i1, i2) 
+ 
+#define hypre_newBoxLoop3For(i1, i2, i3)
+
+#define hypre_newBoxLoop4For(i1, i2, i3, i4)
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+  
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex  
+#define hypre_BoxLoopSetOneBlock() ; 
+#define hypre_BoxLoopBlock()       0
+
+#define hypre_BoxLoop0Begin      hypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        hypre_newBoxLoop0For
+#define hypre_BoxLoop0End        hypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      hypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        hypre_newBoxLoop1For
+#define hypre_BoxLoop1End        hypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      hypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        hypre_newBoxLoop2For
+#define hypre_BoxLoop2End        hypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      hypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        hypre_newBoxLoop3For
+#define hypre_BoxLoop3End        hypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      hypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        hypre_newBoxLoop4For
+#define hypre_BoxLoop4End        hypre_newBoxLoop4End
+#endif
+#else
 /*BHEADER**********************************************************************
  * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
  * Produced at the Lawrence Livermore National Laboratory.
@@ -10,20 +2132,382 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-#ifndef hypre_STRUCT_MV_HEADER
-#define hypre_STRUCT_MV_HEADER
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+#ifdef HYPRE_USING_OPENMP
+#define Pragma(x) _Pragma(#x)
+#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_SMP_SCHEDULE)
+#define OMPREDUCTION() Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE)
+#else
+#define OMP1
+#define OMPREDUCTION() ;
+#endif
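
Pragma() stringizes its argument into C99 _Pragma, so OMP1 placed before a block loop is equivalent to writing the pragma inline:

    /* what OMP1 amounts to when HYPRE_USING_OPENMP is set */
    #pragma omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_SMP_SCHEDULE
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)
    { /* ... */ }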
+
+typedef struct hypre_Boxloop_struct
+  {
+    HYPRE_Int lsize0,lsize1,lsize2;
+    HYPRE_Int strides0,strides1,strides2;
+    HYPRE_Int bstart0,bstart1,bstart2;
+    HYPRE_Int bsize0,bsize1,bsize2;
+  }hypre_Boxloop;
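
Field meanings, as can be read off the hypre_BoxLoopDataDeclareK initializer in the CUDA branch above (dimension 2 shown):

    /* lsize2   = loop_size[2];                  loop extent            */
    /* strides2 = stride[2];                     stride in the data box */
    /* bstart2  = start[2] - dbox->imin[2];      loop start in the box  */
    /* bsize2   = dbox->imax[2] - dbox->imin[2]; box extent minus one   */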
+
+#define zypre_newBoxLoop0Begin(ndim, loop_size)				\
+{\
+   zypre_BoxLoopDeclare();									\
+   zypre_BoxLoopInit(ndim, loop_size);						\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
+   {\
+      zypre_BoxLoopSet();\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop0End()\
+         }\
+         zypre_BoxLoopInc1();\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define zypre_newBoxLoop1Begin(ndim, loop_size,				\
+                               dbox1, start1, stride1, i1)              \
+	{														\
+	zypre_BoxLoopDeclare();									\
+	zypre_BoxLoopDeclareK(1);								\
+	zypre_BoxLoopInit(ndim, loop_size);						\
+	zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);					\
+	OMP1\
+	for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \
+	{																	\
+		zypre_BoxLoopSet();												\
+		zypre_BoxLoopSetK(1, i1);										\
+		for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)			\
+		{																\
+			for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)		\
+			{
+
+#define zypre_newBoxLoop1End(i1)				\
+	             i1 += hypre__i0inc1;						\
+		    }											\
+			zypre_BoxLoopInc1();					\
+	        i1 += hypre__ikinc1[hypre__d];				\
+	        zypre_BoxLoopInc2();						\
+		}											\
+	}											\
+}
+
+
+#define zypre_newBoxLoop2Begin(ndim, loop_size,\
+							   dbox1, start1, stride1, i1,	\
+							   dbox2, start2, stride2, i2)	\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop2End(i1, i2)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+
+#define zypre_newBoxLoop3Begin(ndim, loop_size,\
+							   dbox1, start1, stride1, i1,	\
+							   dbox2, start2, stride2, i2,	\
+							   dbox3, start3, stride3, i3)	\
+{														\
+   zypre_BoxLoopDeclare();									\
+   zypre_BoxLoopDeclareK(1);								\
+   zypre_BoxLoopDeclareK(2);								\
+   zypre_BoxLoopDeclareK(3);								\
+   zypre_BoxLoopInit(ndim, loop_size);						\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);		\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);		\
+   zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3);		\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      zypre_BoxLoopSetK(3, i3);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop3End(i1, i2, i3)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+            i3 += hypre__i0inc3;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         i3 += hypre__ikinc3[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define zypre_newBoxLoop4Begin(ndim, loop_size,\
+                            dbox1, start1, stride1, i1,\
+                            dbox2, start2, stride2, i2,\
+                            dbox3, start3, stride3, i3,\
+                            dbox4, start4, stride4, i4)\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopDeclareK(3);\
+   zypre_BoxLoopDeclareK(4);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3);\
+   zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4);\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      zypre_BoxLoopSetK(3, i3);\
+      zypre_BoxLoopSetK(4, i4);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop4End(i1, i2, i3, i4)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+            i3 += hypre__i0inc3;\
+            i4 += hypre__i0inc4;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         i3 += hypre__ikinc3[hypre__d];\
+         i4 += hypre__ikinc4[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+                                        sum)				\
+{									\
+   zypre_BoxLoopDeclare();						\
+   zypre_BoxLoopDeclareK(1);						\
+   zypre_BoxLoopInit(ndim, loop_size);					\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);			\
+   OMPREDUCTION()							\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
+#define hypre_newBoxLoop1ReductionEnd(i1, sum)\
+            i1 += hypre__i0inc1;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
 
-#include "HYPRE_struct_mv.h"
-#include "_hypre_utilities.h"
+#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,				\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,	\
+                                        sum)							\
+{\
+   HYPRE_Int i1,i2;				\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   OMPREDUCTION()														\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
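
OMPREDUCTION() assumes the caller has defined HYPRE_BOX_REDUCTION to the desired clause before the loop expands; a typical (assumed) definition for a sum reduction:

    #define HYPRE_BOX_REDUCTION reduction(+:local_result)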
+
+#define hypre_LoopBegin(size,idx)			\
+{									\
+   HYPRE_Int idx;							\
+   for (idx = 0;idx < size;idx ++)					\
+   {
+
+#define hypre_LoopEnd()					\
+  }							\
+}
+
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{									\
+    HYPRE_Int hypre__tot = 1;						\
+    hypre_Boxloop databox1;						\
+    HYPRE_Int d,idx;							\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (d = 0;d < ndim;d ++)						\
+    {									\
+	hypre__tot *= loop_size[d];					\
+    }									\
+    for (idx = 0;idx < hypre__tot;idx++)				\
+      {									\
+	  HYPRE_Int local_idx;						\
+	  HYPRE_Int idx_local = idx;					\
+	  HYPRE_Int i1 = 0;						\
+	  local_idx  = idx_local % databox1.lsize0;			\
+	  idx_local  = idx_local / databox1.lsize0;			\
+	  i1 += local_idx*databox1.strides0;				\
+	  local_idx  = idx_local % databox1.lsize1;			\
+	  idx_local  = idx_local / databox1.lsize1;			\
+	  i1 += local_idx*databox1.strides1;				\
+	  local_idx  = idx_local % databox1.lsize2;			\
+	  idx_local  = idx_local / databox1.lsize2;			\
+	  i1 += local_idx*databox1.strides2;				\
+
+
+#define hypre_BoxBoundaryCopyEnd()					\
+  }									\
+}
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,			\
+                                   stride1, i1,				\
+                                   stride2, i2)				\
+{									\
+   HYPRE_Int hypre__tot = 1,idx;					\
+   hypre_Boxloop databox1,databox2;					\
+   HYPRE_Int d;								\
+   databox1.lsize0 = loop_size[0];					\
+   databox1.lsize1 = loop_size[1];					\
+   databox1.lsize2 = loop_size[2];					\
+   databox1.strides0 = stride1[0];					\
+   databox1.strides1 = stride1[1];					\
+   databox1.strides2 = stride1[2];					\
+   databox2.lsize0 = loop_size[0];					\
+   databox2.lsize1 = loop_size[1];					\
+   databox2.lsize2 = loop_size[2];					\
+   databox2.strides0 = stride2[0];					\
+   databox2.strides1 = stride2[1];					\
+   databox2.strides2 = stride2[2];					\
+   for (d = 0;d < ndim;d ++)						\
+   {									\
+      hypre__tot *= loop_size[d];					\
+   }									\
+   for (idx = 0;idx < hypre__tot;idx++)					\
+   {									\
+      HYPRE_Int local_idx;						\
+      HYPRE_Int idx_local = idx;					\
+      HYPRE_Int i1 = 0, i2 = 0;						\
+      local_idx  = idx_local % databox1.lsize0;				\
+      idx_local  = idx_local / databox1.lsize0;				\
+      i1 += local_idx*databox1.strides0;				\
+      i2 += local_idx*databox2.strides0;				\
+      local_idx  = idx_local % databox1.lsize1;				\
+      idx_local  = idx_local / databox1.lsize1;				\
+      i1 += local_idx*databox1.strides1;				\
+      i2 += local_idx*databox2.strides1;				\
+      local_idx  = idx_local % databox1.lsize2;				\
+      idx_local  = idx_local / databox1.lsize2;				\
+      i1 += local_idx*databox1.strides2;				\
+      i2 += local_idx*databox2.strides2;
+
+#define hypre_BoxDataExchangeEnd()					\
+   }                                                                    \
+}
 
+#define hypre_newBoxLoopGetIndex zypre_BoxLoopGetIndex  
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock
+#define hypre_BoxLoopBlock       zypre_BoxLoopBlock
+#define hypre_BoxLoop0Begin      zypre_BoxLoop0Begin
+#define hypre_BoxLoop0For        zypre_BoxLoop0For
+#define hypre_BoxLoop0End        zypre_BoxLoop0End
+#define hypre_BoxLoop1Begin      zypre_BoxLoop1Begin
+#define hypre_BoxLoop1For        zypre_BoxLoop1For
+#define hypre_BoxLoop1End        zypre_BoxLoop1End
+#define hypre_BoxLoop2Begin      zypre_BoxLoop2Begin
+#define hypre_BoxLoop2For        zypre_BoxLoop2For
+#define hypre_BoxLoop2End        zypre_BoxLoop2End
+#define hypre_BoxLoop3Begin      zypre_BoxLoop3Begin
+#define hypre_BoxLoop3For        zypre_BoxLoop3For
+#define hypre_BoxLoop3End        zypre_BoxLoop3End
+#define hypre_BoxLoop4Begin      zypre_BoxLoop4Begin
+#define hypre_BoxLoop4For        zypre_BoxLoop4For
+#define hypre_BoxLoop4End        zypre_BoxLoop4End
+#define hypre_BasicBoxLoop2Begin zypre_BasicBoxLoop2Begin
+
+#endif
+#endif
 #ifdef __cplusplus
 extern "C" {
 #endif
-
 /*BHEADER**********************************************************************
  * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
  * Produced at the Lawrence Livermore National Laboratory.
@@ -196,329 +2680,155 @@ for (i = 0; i < hypre_BoxArrayArraySize(box_array_array); i++)
  * BoxLoop macros:
  *--------------------------------------------------------------------------*/
 
-#if 0 /* set to 0 to use the new box loops */
-
-#define HYPRE_BOX_PRIVATE hypre__nx,hypre__ny,hypre__nz,hypre__i,hypre__j,hypre__k
-
-#define hypre_BoxLoopDeclareS(dbox, stride, sx, sy, sz) \
-HYPRE_Int  sx = (hypre_IndexX(stride));\
-HYPRE_Int  sy = (hypre_IndexY(stride)*hypre_BoxSizeX(dbox));\
-HYPRE_Int  sz = (hypre_IndexZ(stride)*\
-           hypre_BoxSizeX(dbox)*hypre_BoxSizeY(dbox))
-
-#define hypre_BoxLoopDeclareN(loop_size) \
-HYPRE_Int  hypre__i, hypre__j, hypre__k;\
-HYPRE_Int  hypre__nx = hypre_IndexX(loop_size);\
-HYPRE_Int  hypre__ny = hypre_IndexY(loop_size);\
-HYPRE_Int  hypre__nz = hypre_IndexZ(loop_size);\
-HYPRE_Int  hypre__mx = hypre__nx;\
-HYPRE_Int  hypre__my = hypre__ny;\
-HYPRE_Int  hypre__mz = hypre__nz;\
-HYPRE_Int  hypre__dir, hypre__max;\
-HYPRE_Int  hypre__div, hypre__mod;\
-HYPRE_Int  hypre__block, hypre__num_blocks;\
-hypre__dir = 0;\
-hypre__max = hypre__nx;\
-if (hypre__ny > hypre__max)\
-{\
-   hypre__dir = 1;\
-   hypre__max = hypre__ny;\
-}\
-if (hypre__nz > hypre__max)\
-{\
-   hypre__dir = 2;\
-   hypre__max = hypre__nz;\
-}\
-hypre__num_blocks = hypre_NumThreads();\
-if (hypre__max < hypre__num_blocks)\
+#ifdef HYPRE_USE_RAJA
+#define hypre_Reductioninit(local_result)\
+HYPRE_Real       local_result;\
+local_result = 0.0;
+//ReduceSum< cuda_reduce<BLOCKSIZE>, HYPRE_Real> local_result(0.0);
+#else
+#define hypre_Reductioninit(local_result)\
+HYPRE_Real       local_result;\
+local_result = 0.0;
+#endif
+
+#if defined(HYPRE_MEMORY_GPU)
+
+#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size)\
+HYPRE_Int * indices_d;\
+HYPRE_Int indices_h[stencil_size];\
+HYPRE_Int * stencil_shape_d;\
+HYPRE_Int  stencil_shape_h[size*stencil_size];\
+HYPRE_Complex * data_A = hypre_StructMatrixData(A);\
+indices_d = hypre_DeviceTAlloc(HYPRE_Int, stencil_size);\
+stencil_shape_d = hypre_DeviceTAlloc(HYPRE_Int, size*stencil_size);\
+for (HYPRE_Int ii = 0; ii < stencil_size; ii++)\
 {\
-   hypre__num_blocks = hypre__max;\
+   HYPRE_Int jj = 0;\
+   indices_h[ii]       = hypre_StructMatrixDataIndices(A)[i][ii];\
+   if (size > 1) cdir = 0;\
+   stencil_shape_h[ii] = hypre_IndexD(stencil_shape[ii], cdir);\
+   for (jj = 1;jj < size;jj++)\
+      stencil_shape_h[jj*stencil_size+ii] = hypre_IndexD(stencil_shape[ii], jj);\
 }\
-if (hypre__num_blocks > 0)\
-{\
-   hypre__div = hypre__max / hypre__num_blocks;\
-   hypre__mod = hypre__max % hypre__num_blocks;\
-}
-
-#define hypre_BoxLoopSet(i, j, k) \
-i = 0;\
-j = 0;\
-k = 0;\
-hypre__nx = hypre__mx;\
-hypre__ny = hypre__my;\
-hypre__nz = hypre__mz;\
-if (hypre__num_blocks > 1)\
-{\
-   if (hypre__dir == 0)\
-   {\
-      i = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__nx = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-   else if (hypre__dir == 1)\
-   {\
-      j = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__ny = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-   else if (hypre__dir == 2)\
-   {\
-      k = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__nz = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-}
-
-#define hypre_BoxLoopGetIndex(index) \
-index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
-
-/* Use this before the For macros below to force only one block */
-#define hypre_BoxLoopSetOneBlock() hypre__num_blocks = 1
+hypre_DataCopyToData(indices_h,indices_d,HYPRE_Int,stencil_size);\
+hypre_DataCopyToData(stencil_shape_h,stencil_shape_d,HYPRE_Int,size*stencil_size);\
 
-/* Use this to get the block iteration inside a BoxLoop */
-#define hypre_BoxLoopBlock() hypre__block
+#define hypre_StructGetMatrixBoxData(A, i, si)  (data_A + indices_d[si])
 
-/*-----------------------------------*/
+#define hypre_StructGetIndexD(index,i,index_d) (index_d)
 
-#define hypre_BoxLoop0Begin(ndim, loop_size)\
-{\
-   hypre_BoxLoopDeclareN(loop_size);
+#define hypre_StructCleanIndexD()\
+hypre_DeviceTFree(indices_d);\
+hypre_DeviceTFree(stencil_shape_d);
 
-#define hypre_BoxLoop0For()\
-   hypre__BoxLoop0For(hypre__i, hypre__j, hypre__k)
-#define hypre__BoxLoop0For(i, j, k)\
-   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
-   {\
-   hypre_BoxLoopSet(i, j, k);\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
-      {\
-         for (i = 0; i < hypre__nx; i++)\
-         {
+#define hypre_StructPreparePrint()\
+HYPRE_Int tot_size = num_values*hypre_BoxVolume(hypre_BoxArrayBox(data_space, hypre_BoxArraySize(box_array)-1));\
+data_host = hypre_CTAlloc(HYPRE_Complex, tot_size);\
+hypre_DataCopyFromData(data_host,data,HYPRE_Complex,tot_size);
 
-#define hypre_BoxLoop0End()\
-         }\
-      }\
-   }\
-   }\
-}
-  
-/*-----------------------------------*/
+#define hypre_StructPostPrint() hypre_TFree(data_host)
 
-#define hypre_BoxLoop1Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1)\
-{\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareN(loop_size);
+#else
 
-#define hypre_BoxLoop1For(i1)\
-   hypre__BoxLoop1For(hypre__i, hypre__j, hypre__k, i1)
-#define hypre__BoxLoop1For(i, j, k, i1)\
-   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
-   {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
-      {\
-         for (i = 0; i < hypre__nx; i++)\
-         {
+#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size)
+#define hypre_StructGetMatrixBoxData(A, i, si) hypre_StructMatrixBoxData(A,i,si)
+#define hypre_StructGetIndexD(index,i,index_d) hypre_IndexD(index,i)
+#define hypre_StructCleanIndexD() {;}
+#define hypre_StructPreparePrint() data_host = data;
+#define hypre_StructPostPrint() {;}
 
-#define hypre_BoxLoop1End(i1)\
-            i1 += hypre__sx1;\
-         }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-      }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-   }\
-   }\
-}
+#endif
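
Both branches give printing code the same shape; a sketch of the intended call pattern (data, data_space, box_array, and num_values are assumed in scope, as the GPU macro requires):

    HYPRE_Complex *data_host;
    hypre_StructPreparePrint();   /* GPU: copy device data to data_host; CPU: alias */
    /* ... write values from data_host ... */
    hypre_StructPostPrint();      /* GPU: free the host copy; CPU: no-op */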
   
-/*-----------------------------------*/
-
-#define hypre_BoxLoop2Begin(ndim,loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2)\
+#define hypre_SerialBoxLoop0Begin(ndim, loop_size)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop2For(i1, i2)\
-   hypre__BoxLoop2For(hypre__i, hypre__j, hypre__k, i1, i2)
-#define hypre__BoxLoop2For(i, j, k, i1, i2)\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   hypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop2End(i1, i2)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
+#define hypre_SerialBoxLoop0End()\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
+         zypre_BoxLoopInc1();\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-   }\
    }\
 }
 
-/*-----------------------------------*/
-
-#define hypre_BoxLoop3Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2,\
-                            dbox3, start3, stride3, i3)\
+#define hypre_SerialBoxLoop1Begin(ndim, loop_size,\
+                                  dbox1, start1, stride1, i1)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   HYPRE_Int  hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop3For(i1, i2, i3)\
-   hypre__BoxLoop3For(hypre__i, hypre__j, hypre__k, i1, i2, i3)
-#define hypre__BoxLoop3For(i, j, k, i1, i2, i3)\
+   HYPRE_Int i1;\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop3End(i1, i2, i3)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
-            i3 += hypre__sx3;\
+#define hypre_SerialBoxLoop1End(i1)\
+            i1 += hypre__i0inc1;\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
-         i3 += hypre__sy3 - hypre__nx*hypre__sx3;\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-      i3 += hypre__sz3 - hypre__ny*hypre__sy3;\
-   }\
    }\
 }
 
-/*-----------------------------------*/
-
-#define hypre_BoxLoop4Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2,\
-                            dbox3, start3, stride3, i3,\
-                            dbox4, start4, stride4, i4)\
+#define hypre_SerialBoxLoop2Begin(ndim, loop_size,\
+                                  dbox1, start1, stride1, i1,\
+                                  dbox2, start2, stride2, i2)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   HYPRE_Int  hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\
-   HYPRE_Int  hypre__i4start = hypre_BoxIndexRank(dbox4, start4);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\
-   hypre_BoxLoopDeclareS(dbox4, stride4, hypre__sx4, hypre__sy4, hypre__sz4);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop4For(i1, i2, i3, i4)\
-   hypre__BoxLoop4For(hypre__i, hypre__j, hypre__k, i1, i2, i3, i4)
-#define hypre__BoxLoop4For(i, j, k, i1, i2, i3, i4)\
+   HYPRE_Int i1,i2;\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   zypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\
-   i4 = hypre__i4start + i*hypre__sx4 + j*hypre__sy4 + k*hypre__sz4;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop4End(i1, i2, i3, i4)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
-            i3 += hypre__sx3;\
-            i4 += hypre__sx4;\
+#define hypre_SerialBoxLoop2End(i1, i2)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
-         i3 += hypre__sy3 - hypre__nx*hypre__sx3;\
-         i4 += hypre__sy4 - hypre__nx*hypre__sx4;\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-      i3 += hypre__sz3 - hypre__ny*hypre__sy3;\
-      i4 += hypre__sz4 - hypre__ny*hypre__sy4;\
-   }\
    }\
 }
 
-/*-----------------------------------*/
-
+#if defined (HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS)
+#define HYPRE_BOX_PRIVATE hypre__global_error
 #else
-
-#define HYPRE_BOX_PRIVATE        ZYPRE_BOX_PRIVATE
-
-#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
-#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock
-#define hypre_BoxLoopBlock       zypre_BoxLoopBlock
-#define hypre_BoxLoop0Begin      zypre_BoxLoop0Begin
-#define hypre_BoxLoop0For        zypre_BoxLoop0For
-#define hypre_BoxLoop0End        zypre_BoxLoop0End
-#define hypre_BoxLoop1Begin      zypre_BoxLoop1Begin
-#define hypre_BoxLoop1For        zypre_BoxLoop1For
-#define hypre_BoxLoop1End        zypre_BoxLoop1End
-#define hypre_BoxLoop2Begin      zypre_BoxLoop2Begin
-#define hypre_BoxLoop2For        zypre_BoxLoop2For
-#define hypre_BoxLoop2End        zypre_BoxLoop2End
-#define hypre_BoxLoop3Begin      zypre_BoxLoop3Begin
-#define hypre_BoxLoop3For        zypre_BoxLoop3For
-#define hypre_BoxLoop3End        zypre_BoxLoop3End
-#define hypre_BoxLoop4Begin      zypre_BoxLoop4Begin
-#define hypre_BoxLoop4For        zypre_BoxLoop4For
-#define hypre_BoxLoop4End        zypre_BoxLoop4End
-
-#endif /* end if 1 */
-
+#define HYPRE_BOX_PRIVATE ZYPRE_BOX_PRIVATE
 #endif
-
-/******************************************************************************
- *
- * NEW BoxLoop STUFF
- *
- *****************************************************************************/
-
-#ifndef hypre_ZBOX_HEADER
-#define hypre_ZBOX_HEADER
-
 #define ZYPRE_BOX_PRIVATE hypre__IN,hypre__JN,hypre__I,hypre__J,hypre__d,hypre__i
 
-/*--------------------------------------------------------------------------
- * BoxLoop macros:
- *--------------------------------------------------------------------------*/
-
 #define zypre_BoxLoopDeclare() \
 HYPRE_Int  hypre__tot, hypre__div, hypre__mod;\
 HYPRE_Int  hypre__block, hypre__num_blocks;\
@@ -655,6 +2965,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 #define zypre_BoxLoop1Begin(ndim, loop_size,\
                             dbox1, start1, stride1, i1)\
 {\
+   HYPRE_Int i1;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopInit(ndim, loop_size);\
@@ -663,6 +2974,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 #define zypre_BoxLoop1For(i1)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
@@ -686,6 +2998,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
                             dbox1, start1, stride1, i1,\
                             dbox2, start2, stride2, i2)\
 {\
+   HYPRE_Int i1,i2;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -696,6 +3009,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 #define zypre_BoxLoop2For(i1, i2)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -723,6 +3037,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
                             dbox2, start2, stride2, i2,\
                             dbox3, start3, stride3, i3)\
 {\
+   HYPRE_Int i1,i2,i3;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -735,6 +3050,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 #define zypre_BoxLoop3For(i1, i2, i3)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2,i3;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -766,6 +3082,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
                             dbox3, start3, stride3, i3,\
                             dbox4, start4, stride4, i4)\
 {\
+   HYPRE_Int i1,i2,i3,i4;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -780,6 +3097,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 #define zypre_BoxLoop4For(i1, i2, i3, i4)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2,i3,i4;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -808,6 +3126,32 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 
 /*-----------------------------------*/
 
+#define zypre_BasicBoxLoopInitK(k, stridek) \
+hypre__sk##k[0] = stridek[0];\
+hypre__ikinc##k[0] = 0;\
+for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
+{\
+   hypre__sk##k[hypre__d] = stridek[hypre__d];\
+   hypre__ikinc##k[hypre__d] = hypre__ikinc##k[hypre__d-1] +\
+      hypre__sk##k[hypre__d] - hypre__n[hypre__d-1]*hypre__sk##k[hypre__d-1];\
+}\
+hypre__i0inc##k = hypre__sk##k[0];\
+hypre__ikinc##k[hypre__ndim] = 0;\
+hypre__ikstart##k = 0
+
+#define zypre_BasicBoxLoop2Begin(ndim, loop_size,\
+                                 stride1, i1,\
+                                 stride2, i2)\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BasicBoxLoopInitK(1, stride1);\
+   zypre_BasicBoxLoopInitK(2, stride2);
+
+/*-----------------------------------*/
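
zypre_BasicBoxLoop2Begin has no dedicated End macro; a sketch of the intended closing sequence, assuming the ordinary 2-loop For/End pair as elsewhere in struct_mv (dst and src are placeholders):

    zypre_BasicBoxLoop2Begin(ndim, loop_size, stride1, i1, stride2, i2);
    zypre_BoxLoop2For(i1, i2)
    {
       dst[i2] = src[i1];
    }
    zypre_BoxLoop2End(i1, i2);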
+
 #endif
 
 
@@ -1469,6 +3813,9 @@ typedef struct hypre_CommHandle_struct
    HYPRE_Complex **send_buffers;
    HYPRE_Complex **recv_buffers;
 
+   HYPRE_Complex      **send_buffers_data;
+   HYPRE_Complex      **recv_buffers_data;
+	
    /* set = 0, add = 1 */
    HYPRE_Int       action;
 
@@ -1576,6 +3923,8 @@ typedef struct hypre_CommHandle_struct
 #define hypre_CommHandleSendBuffers(comm_handle) (comm_handle -> send_buffers)
 #define hypre_CommHandleRecvBuffers(comm_handle) (comm_handle -> recv_buffers)
 #define hypre_CommHandleAction(comm_handle)      (comm_handle -> action)
+#define hypre_CommHandleSendBuffersDevice(comm_handle)    (comm_handle -> send_buffers_data)
+#define hypre_CommHandleRecvBuffersDevice(comm_handle)    (comm_handle -> recv_buffers_data)
 
 #endif
 /*BHEADER**********************************************************************
@@ -2028,7 +4377,6 @@ HYPRE_Int HYPRE_StructVectorSetConstantValues ( HYPRE_StructVector vector , HYPR
 HYPRE_Int HYPRE_StructVectorGetMigrateCommPkg ( HYPRE_StructVector from_vector , HYPRE_StructVector to_vector , HYPRE_CommPkg *comm_pkg );
 HYPRE_Int HYPRE_StructVectorMigrate ( HYPRE_CommPkg comm_pkg , HYPRE_StructVector from_vector , HYPRE_StructVector to_vector );
 HYPRE_Int HYPRE_CommPkgDestroy ( HYPRE_CommPkg comm_pkg );
-HYPRE_Int HYPRE_StructVectorClone ( HYPRE_StructVector x, HYPRE_StructVector *y_ptr );
 
 /* project.c */
 HYPRE_Int hypre_ProjectBox ( hypre_Box *box , hypre_Index index , hypre_Index stride );
@@ -2141,7 +4489,7 @@ HYPRE_Int hypre_StructVectorSetNumGhost ( hypre_StructVector *vector , HYPRE_Int
 HYPRE_Int hypre_StructVectorAssemble ( hypre_StructVector *vector );
 HYPRE_Int hypre_StructVectorCopy ( hypre_StructVector *x , hypre_StructVector *y );
 HYPRE_Int hypre_StructVectorSetConstantValues ( hypre_StructVector *vector , HYPRE_Complex values );
-HYPRE_Int hypre_StructVectorSetFunctionValues ( hypre_StructVector *vector , HYPRE_Complex (*fcn )(HYPRE_Int, HYPRE_Int, HYPRE_Int));
+HYPRE_Int hypre_StructVectorSetFunctionValues ( hypre_StructVector *vector , HYPRE_Complex (*fcn )());
 HYPRE_Int hypre_StructVectorClearGhostValues ( hypre_StructVector *vector );
 HYPRE_Int hypre_StructVectorClearBoundGhostValues ( hypre_StructVector *vector , HYPRE_Int force );
 HYPRE_Int hypre_StructVectorScaleValues ( hypre_StructVector *vector , HYPRE_Complex factor );
@@ -2152,7 +4500,6 @@ hypre_StructVector *hypre_StructVectorRead ( MPI_Comm comm , const char *filenam
 HYPRE_Int hypre_StructVectorMaxValue ( hypre_StructVector *vector , HYPRE_Real *max_value , HYPRE_Int *max_index , hypre_Index max_xyz_index );
 hypre_StructVector *hypre_StructVectorClone ( hypre_StructVector *vector );
 
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/struct_mv/assumed_part.c b/src/struct_mv/assumed_part.c
index a3be50f..2246f12 100644
--- a/src/struct_mv/assumed_part.c
+++ b/src/struct_mv/assumed_part.c
@@ -158,9 +158,7 @@ hypre_APSubdivideRegion( hypre_Box      *region,
    }
 
    count = 0;
-   hypre_BoxLoop0Begin(ndim, div);
-   hypre_BoxLoopSetOneBlock();
-   hypre_BoxLoop0For()
+   hypre_SerialBoxLoop0Begin(ndim, div);
    {
       box = hypre_BoxArrayBox(box_array, count);
       hypre_BoxLoopGetIndex(index);
@@ -172,7 +170,7 @@ hypre_APSubdivideRegion( hypre_Box      *region,
       }
       count++;
    }
-   hypre_BoxLoop0End();
+   hypre_SerialBoxLoop0End();
 
    /* clean up */
    for (i = 0; i < ndim; i++) 
@@ -1635,7 +1633,7 @@ hypre_StructAssumedPartitionGetProcsFromBox(
 
    HYPRE_Int       i, d, p, q, r, myid;
    HYPRE_Int       num_regions, in_regions, this_region, proc_count, proc_start;
-   HYPRE_Int       adj_proc_id, extra, num_partitions, part_num;
+   HYPRE_Int       adj_proc_id, extra, num_partitions;
    HYPRE_Int       width;
    
    HYPRE_Int      *proc_array, proc_array_count;
@@ -1772,9 +1770,7 @@ hypre_StructAssumedPartitionGetProcsFromBox(
       hypre_SetIndex(stride, 1);
       hypre_BoxGetSize(part_box, loop_size);
       hypre_BoxSetExtents(part_dbox, stride, div);
-      hypre_BoxLoop1Begin(ndim, loop_size, part_dbox, start, stride, part_num);
-      hypre_BoxLoopSetOneBlock();
-      hypre_BoxLoop1For(part_num)
+      hypre_SerialBoxLoop1Begin(ndim, loop_size, part_dbox, start, stride, part_num);
       {
          /*convert the partition number to a processor number*/
          if (part_num < (2*extra))
@@ -1795,7 +1791,7 @@ hypre_StructAssumedPartitionGetProcsFromBox(
          proc_ids[num_proc_ids] = adj_proc_id + proc_start;
          num_proc_ids++;
       }
-      hypre_BoxLoop1End(part_num);
+      hypre_SerialBoxLoop1End(part_num);
 
    } /*end of for each region loop*/
 
diff --git a/src/struct_mv/box.h b/src/struct_mv/box.h
index cd1a24c..26d4b5c 100644
--- a/src/struct_mv/box.h
+++ b/src/struct_mv/box.h
@@ -170,329 +170,155 @@ for (i = 0; i < hypre_BoxArrayArraySize(box_array_array); i++)
  * BoxLoop macros:
  *--------------------------------------------------------------------------*/
 
-#if 0 /* set to 0 to use the new box loops */
-
-#define HYPRE_BOX_PRIVATE hypre__nx,hypre__ny,hypre__nz,hypre__i,hypre__j,hypre__k
-
-#define hypre_BoxLoopDeclareS(dbox, stride, sx, sy, sz) \
-HYPRE_Int  sx = (hypre_IndexX(stride));\
-HYPRE_Int  sy = (hypre_IndexY(stride)*hypre_BoxSizeX(dbox));\
-HYPRE_Int  sz = (hypre_IndexZ(stride)*\
-           hypre_BoxSizeX(dbox)*hypre_BoxSizeY(dbox))
-
-#define hypre_BoxLoopDeclareN(loop_size) \
-HYPRE_Int  hypre__i, hypre__j, hypre__k;\
-HYPRE_Int  hypre__nx = hypre_IndexX(loop_size);\
-HYPRE_Int  hypre__ny = hypre_IndexY(loop_size);\
-HYPRE_Int  hypre__nz = hypre_IndexZ(loop_size);\
-HYPRE_Int  hypre__mx = hypre__nx;\
-HYPRE_Int  hypre__my = hypre__ny;\
-HYPRE_Int  hypre__mz = hypre__nz;\
-HYPRE_Int  hypre__dir, hypre__max;\
-HYPRE_Int  hypre__div, hypre__mod;\
-HYPRE_Int  hypre__block, hypre__num_blocks;\
-hypre__dir = 0;\
-hypre__max = hypre__nx;\
-if (hypre__ny > hypre__max)\
-{\
-   hypre__dir = 1;\
-   hypre__max = hypre__ny;\
-}\
-if (hypre__nz > hypre__max)\
-{\
-   hypre__dir = 2;\
-   hypre__max = hypre__nz;\
-}\
-hypre__num_blocks = hypre_NumThreads();\
-if (hypre__max < hypre__num_blocks)\
-{\
-   hypre__num_blocks = hypre__max;\
-}\
-if (hypre__num_blocks > 0)\
-{\
-   hypre__div = hypre__max / hypre__num_blocks;\
-   hypre__mod = hypre__max % hypre__num_blocks;\
-}
+#ifdef HYPRE_USE_RAJA
+#define hypre_Reductioninit(local_result)\
+HYPRE_Real       local_result;\
+local_result = 0.0;
+//ReduceSum< cuda_reduce<BLOCKSIZE>, HYPRE_Real> local_result(0.0);
+#else
+#define hypre_Reductioninit(local_result)\
+HYPRE_Real       local_result;\
+local_result = 0.0;
+#endif
 
-#define hypre_BoxLoopSet(i, j, k) \
-i = 0;\
-j = 0;\
-k = 0;\
-hypre__nx = hypre__mx;\
-hypre__ny = hypre__my;\
-hypre__nz = hypre__mz;\
-if (hypre__num_blocks > 1)\
+#if defined(HYPRE_MEMORY_GPU)
+
+#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size)\
+HYPRE_Int * indices_d;\
+HYPRE_Int indices_h[stencil_size];\
+HYPRE_Int * stencil_shape_d;\
+HYPRE_Int  stencil_shape_h[size*stencil_size];\
+HYPRE_Complex * data_A = hypre_StructMatrixData(A);\
+indices_d = hypre_DeviceTAlloc(HYPRE_Int, stencil_size);\
+stencil_shape_d = hypre_DeviceTAlloc(HYPRE_Int, size*stencil_size);\
+for (HYPRE_Int ii = 0; ii < stencil_size; ii++)\
 {\
-   if (hypre__dir == 0)\
-   {\
-      i = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__nx = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-   else if (hypre__dir == 1)\
-   {\
-      j = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__ny = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-   else if (hypre__dir == 2)\
-   {\
-      k = hypre__block * hypre__div + hypre_min(hypre__mod, hypre__block);\
-      hypre__nz = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0);\
-   }\
-}
+   HYPRE_Int jj = 0;\
+   indices_h[ii]       = hypre_StructMatrixDataIndices(A)[i][ii];\
+   if (size > 1) cdir = 0;\
+   stencil_shape_h[ii] = hypre_IndexD(stencil_shape[ii], cdir);\
+   for (jj = 1;jj < size;jj++)\
+      stencil_shape_h[jj*stencil_size+ii] = hypre_IndexD(stencil_shape[ii], jj);\
+}\
+hypre_DataCopyToData(indices_h,indices_d,HYPRE_Int,stencil_size);\
+hypre_DataCopyToData(stencil_shape_h,stencil_shape_d,HYPRE_Int,size*stencil_size);\
 
-#define hypre_BoxLoopGetIndex(index) \
-index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+#define hypre_StructGetMatrixBoxData(A, i, si)  (data_A + indices_d[si])
 
-/* Use this before the For macros below to force only one block */
-#define hypre_BoxLoopSetOneBlock() hypre__num_blocks = 1
-
-/* Use this to get the block iteration inside a BoxLoop */
-#define hypre_BoxLoopBlock() hypre__block
+#define hypre_StructGetIndexD(index,i,index_d) (index_d)
 
-/*-----------------------------------*/
+#define hypre_StructCleanIndexD()\
+hypre_DeviceTFree(indices_d);\
+hypre_DeviceTFree(stencil_shape_d);
 
-#define hypre_BoxLoop0Begin(ndim, loop_size)\
-{\
-   hypre_BoxLoopDeclareN(loop_size);
+#define hypre_StructPreparePrint()\
+HYPRE_Int tot_size = num_values*hypre_BoxVolume(hypre_BoxArrayBox(data_space, hypre_BoxArraySize(box_array)-1));\
+data_host = hypre_CTAlloc(HYPRE_Complex, tot_size);\
+hypre_DataCopyFromData(data_host,data,HYPRE_Complex,tot_size);
 
-#define hypre_BoxLoop0For()\
-   hypre__BoxLoop0For(hypre__i, hypre__j, hypre__k)
-#define hypre__BoxLoop0For(i, j, k)\
-   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
-   {\
-   hypre_BoxLoopSet(i, j, k);\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
-      {\
-         for (i = 0; i < hypre__nx; i++)\
-         {
-
-#define hypre_BoxLoop0End()\
-         }\
-      }\
-   }\
-   }\
-}
-  
-/*-----------------------------------*/
+#define hypre_StructPostPrint() hypre_TFree(data_host)
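 
[Editor's note] These print helpers stage device data on the host before I/O;
in the non-GPU branch below they simply alias the pointer. A hedged sketch of
the intended call pattern (the names file, data, data_host, num_values,
box_array, and data_space are assumptions read off the macro bodies, and the
hypre_PrintBoxArrayData signature is assumed):

   HYPRE_Complex *data_host;
   hypre_StructPreparePrint();   /* GPU build: allocate + copy device -> host */
   hypre_PrintBoxArrayData(file, box_array, data_space, num_values, ndim, data_host);
   hypre_StructPostPrint();      /* GPU build: free the host staging buffer */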
 
-#define hypre_BoxLoop1Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1)\
-{\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareN(loop_size);
+#else
 
-#define hypre_BoxLoop1For(i1)\
-   hypre__BoxLoop1For(hypre__i, hypre__j, hypre__k, i1)
-#define hypre__BoxLoop1For(i, j, k, i1)\
-   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
-   {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
-      {\
-         for (i = 0; i < hypre__nx; i++)\
-         {
+#define hypre_MatrixIndexMove(A, stencil_size, i, cdir,size)
+#define hypre_StructGetMatrixBoxData(A, i, si) hypre_StructMatrixBoxData(A,i,si)
+#define hypre_StructGetIndexD(index,i,index_d) hypre_IndexD(index,i)
+#define hypre_StructCleanIndexD() {;}
+#define hypre_StructPreparePrint() data_host = data;
+#define hypre_StructPostPrint() {;}
 
-#define hypre_BoxLoop1End(i1)\
-            i1 += hypre__sx1;\
-         }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-      }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-   }\
-   }\
-}
+#endif
   
-/*-----------------------------------*/
-
-#define hypre_BoxLoop2Begin(ndim,loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2)\
+#define hypre_SerialBoxLoop0Begin(ndim, loop_size)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop2For(i1, i2)\
-   hypre__BoxLoop2For(hypre__i, hypre__j, hypre__k, i1, i2)
-#define hypre__BoxLoop2For(i, j, k, i1, i2)\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   hypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop2End(i1, i2)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
+#define hypre_SerialBoxLoop0End()\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
+         zypre_BoxLoopInc1();\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-   }\
    }\
 }
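 
[Editor's note] The serial variants bundle what callers previously spelled out
as BoxLoopBegin + BoxLoopSetOneBlock + BoxLoopFor (compare the assumed_part.c
hunk above). A minimal sketch, assuming ndim, loop_size, and index are set up
as in that hunk:

   hypre_SerialBoxLoop0Begin(ndim, loop_size)
   {
      hypre_BoxLoopGetIndex(index);  /* current position within loop_size */
      /* ... visit one grid point ... */
   }
   hypre_SerialBoxLoop0End();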
 
-/*-----------------------------------*/
-
-#define hypre_BoxLoop3Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2,\
-                            dbox3, start3, stride3, i3)\
+#define hypre_SerialBoxLoop1Begin(ndim, loop_size,\
+                                  dbox1, start1, stride1, i1)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   HYPRE_Int  hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop3For(i1, i2, i3)\
-   hypre__BoxLoop3For(hypre__i, hypre__j, hypre__k, i1, i2, i3)
-#define hypre__BoxLoop3For(i, j, k, i1, i2, i3)\
+   HYPRE_Int i1;\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop3End(i1, i2, i3)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
-            i3 += hypre__sx3;\
+#define hypre_SerialBoxLoop1End(i1)\
+            i1 += hypre__i0inc1;\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
-         i3 += hypre__sy3 - hypre__nx*hypre__sx3;\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-      i3 += hypre__sz3 - hypre__ny*hypre__sy3;\
-   }\
    }\
 }
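 
[Editor's note] hypre_SerialBoxLoop1Begin declares the data index i1 itself,
which is why the callers converted elsewhere in this patch (assumed_part.c
above, box_manager.c below) drop their own part_num/ii declarations. Usage,
mirroring the box_manager.c hunk:

   hypre_SerialBoxLoop1Begin(ndim, loop_size, table_box, imin, stride, ii);
   {
      entry = index_table[ii];  /* ii is provided by the macro */
      /* ... */
   }
   hypre_SerialBoxLoop1End(ii);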
 
-/*-----------------------------------*/
-
-#define hypre_BoxLoop4Begin(ndim, loop_size,\
-                            dbox1, start1, stride1, i1,\
-                            dbox2, start2, stride2, i2,\
-                            dbox3, start3, stride3, i3,\
-                            dbox4, start4, stride4, i4)\
+#define hypre_SerialBoxLoop2Begin(ndim, loop_size,\
+                                  dbox1, start1, stride1, i1,\
+                                  dbox2, start2, stride2, i2)\
 {\
-   HYPRE_Int  hypre__i1start = hypre_BoxIndexRank(dbox1, start1);\
-   HYPRE_Int  hypre__i2start = hypre_BoxIndexRank(dbox2, start2);\
-   HYPRE_Int  hypre__i3start = hypre_BoxIndexRank(dbox3, start3);\
-   HYPRE_Int  hypre__i4start = hypre_BoxIndexRank(dbox4, start4);\
-   hypre_BoxLoopDeclareS(dbox1, stride1, hypre__sx1, hypre__sy1, hypre__sz1);\
-   hypre_BoxLoopDeclareS(dbox2, stride2, hypre__sx2, hypre__sy2, hypre__sz2);\
-   hypre_BoxLoopDeclareS(dbox3, stride3, hypre__sx3, hypre__sy3, hypre__sz3);\
-   hypre_BoxLoopDeclareS(dbox4, stride4, hypre__sx4, hypre__sy4, hypre__sz4);\
-   hypre_BoxLoopDeclareN(loop_size);
-
-#define hypre_BoxLoop4For(i1, i2, i3, i4)\
-   hypre__BoxLoop4For(hypre__i, hypre__j, hypre__k, i1, i2, i3, i4)
-#define hypre__BoxLoop4For(i, j, k, i1, i2, i3, i4)\
+   HYPRE_Int i1,i2;\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   zypre_BoxLoopSetOneBlock();\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
-   hypre_BoxLoopSet(i, j, k);\
-   i1 = hypre__i1start + i*hypre__sx1 + j*hypre__sy1 + k*hypre__sz1;\
-   i2 = hypre__i2start + i*hypre__sx2 + j*hypre__sy2 + k*hypre__sz2;\
-   i3 = hypre__i3start + i*hypre__sx3 + j*hypre__sy3 + k*hypre__sz3;\
-   i4 = hypre__i4start + i*hypre__sx4 + j*hypre__sy4 + k*hypre__sz4;\
-   for (k = 0; k < hypre__nz; k++)\
-   {\
-      for (j = 0; j < hypre__ny; j++)\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
       {\
-         for (i = 0; i < hypre__nx; i++)\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
          {
 
-#define hypre_BoxLoop4End(i1, i2, i3, i4)\
-            i1 += hypre__sx1;\
-            i2 += hypre__sx2;\
-            i3 += hypre__sx3;\
-            i4 += hypre__sx4;\
+#define hypre_SerialBoxLoop2End(i1, i2)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
          }\
-         i1 += hypre__sy1 - hypre__nx*hypre__sx1;\
-         i2 += hypre__sy2 - hypre__nx*hypre__sx2;\
-         i3 += hypre__sy3 - hypre__nx*hypre__sx3;\
-         i4 += hypre__sy4 - hypre__nx*hypre__sx4;\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
       }\
-      i1 += hypre__sz1 - hypre__ny*hypre__sy1;\
-      i2 += hypre__sz2 - hypre__ny*hypre__sy2;\
-      i3 += hypre__sz3 - hypre__ny*hypre__sy3;\
-      i4 += hypre__sz4 - hypre__ny*hypre__sy4;\
-   }\
    }\
 }
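 
[Editor's note] A sketch of the two-box form, e.g. for copying between data
spaces (src and dst are illustrative assumptions, not names from this patch):

   hypre_SerialBoxLoop2Begin(ndim, loop_size,
                             dbox1, start1, stride1, i1,
                             dbox2, start2, stride2, i2);
   {
      dst[i2] = src[i1];
   }
   hypre_SerialBoxLoop2End(i1, i2);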
 
-/*-----------------------------------*/
-
+#if defined (HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS)
+#define HYPRE_BOX_PRIVATE hypre__global_error
 #else
-
-#define HYPRE_BOX_PRIVATE        ZYPRE_BOX_PRIVATE
-
-#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
-#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock
-#define hypre_BoxLoopBlock       zypre_BoxLoopBlock
-#define hypre_BoxLoop0Begin      zypre_BoxLoop0Begin
-#define hypre_BoxLoop0For        zypre_BoxLoop0For
-#define hypre_BoxLoop0End        zypre_BoxLoop0End
-#define hypre_BoxLoop1Begin      zypre_BoxLoop1Begin
-#define hypre_BoxLoop1For        zypre_BoxLoop1For
-#define hypre_BoxLoop1End        zypre_BoxLoop1End
-#define hypre_BoxLoop2Begin      zypre_BoxLoop2Begin
-#define hypre_BoxLoop2For        zypre_BoxLoop2For
-#define hypre_BoxLoop2End        zypre_BoxLoop2End
-#define hypre_BoxLoop3Begin      zypre_BoxLoop3Begin
-#define hypre_BoxLoop3For        zypre_BoxLoop3For
-#define hypre_BoxLoop3End        zypre_BoxLoop3End
-#define hypre_BoxLoop4Begin      zypre_BoxLoop4Begin
-#define hypre_BoxLoop4For        zypre_BoxLoop4For
-#define hypre_BoxLoop4End        zypre_BoxLoop4End
-
-#endif /* end if 1 */
-
+#define HYPRE_BOX_PRIVATE ZYPRE_BOX_PRIVATE
 #endif
-
-/******************************************************************************
- *
- * NEW BoxLoop STUFF
- *
- *****************************************************************************/
-
-#ifndef hypre_ZBOX_HEADER
-#define hypre_ZBOX_HEADER
-
 #define ZYPRE_BOX_PRIVATE hypre__IN,hypre__JN,hypre__I,hypre__J,hypre__d,hypre__i
 
-/*--------------------------------------------------------------------------
- * BoxLoop macros:
- *--------------------------------------------------------------------------*/
-
 #define zypre_BoxLoopDeclare() \
 HYPRE_Int  hypre__tot, hypre__div, hypre__mod;\
 HYPRE_Int  hypre__block, hypre__num_blocks;\
@@ -629,6 +455,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 #define zypre_BoxLoop1Begin(ndim, loop_size,\
                             dbox1, start1, stride1, i1)\
 {\
+   HYPRE_Int i1;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopInit(ndim, loop_size);\
@@ -637,6 +464,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 #define zypre_BoxLoop1For(i1)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
@@ -660,6 +488,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
                             dbox1, start1, stride1, i1,\
                             dbox2, start2, stride2, i2)\
 {\
+   HYPRE_Int i1,i2;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -670,6 +499,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 #define zypre_BoxLoop2For(i1, i2)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -697,6 +527,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
                             dbox2, start2, stride2, i2,\
                             dbox3, start3, stride3, i3)\
 {\
+   HYPRE_Int i1,i2,i3;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -709,6 +540,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 #define zypre_BoxLoop3For(i1, i2, i3)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2,i3;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -740,6 +572,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
                             dbox3, start3, stride3, i3,\
                             dbox4, start4, stride4, i4)\
 {\
+   HYPRE_Int i1,i2,i3,i4;\
    zypre_BoxLoopDeclare();\
    zypre_BoxLoopDeclareK(1);\
    zypre_BoxLoopDeclareK(2);\
@@ -754,6 +587,7 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 #define zypre_BoxLoop4For(i1, i2, i3, i4)\
    for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
    {\
+      HYPRE_Int i1,i2,i3,i4;\
       zypre_BoxLoopSet();\
       zypre_BoxLoopSetK(1, i1);\
       zypre_BoxLoopSetK(2, i2);\
@@ -782,6 +616,32 @@ for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
 
 /*-----------------------------------*/
 
+#define zypre_BasicBoxLoopInitK(k, stridek) \
+hypre__sk##k[0] = stridek[0];\
+hypre__ikinc##k[0] = 0;\
+for (hypre__d = 1; hypre__d < hypre__ndim; hypre__d++)\
+{\
+   hypre__sk##k[hypre__d] = stridek[hypre__d];\
+   hypre__ikinc##k[hypre__d] = hypre__ikinc##k[hypre__d-1] +\
+      hypre__sk##k[hypre__d] - hypre__n[hypre__d-1]*hypre__sk##k[hypre__d-1];\
+}\
+hypre__i0inc##k = hypre__sk##k[0];\
+hypre__ikinc##k[hypre__ndim] = 0;\
+hypre__ikstart##k = 0
+
+#define zypre_BasicBoxLoop2Begin(ndim, loop_size,\
+                                 stride1, i1,\
+                                 stride2, i2)\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BasicBoxLoopInitK(1, stride1);\
+   zypre_BasicBoxLoopInitK(2, stride2);
+
+/*-----------------------------------*/
+
 #endif
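 
[Editor's note] The basic variant takes strides only (no data boxes), so
i1/i2 advance as pure strided counters from zero. No BasicBoxLoop2End is
defined here, so it presumably closes with the ordinary zypre_BoxLoop2For /
zypre_BoxLoop2End pair; a hedged sketch (x and y are assumptions):

   zypre_BasicBoxLoop2Begin(ndim, loop_size, stride1, i1, stride2, i2);
   zypre_BoxLoop2For(i1, i2)
   {
      y[i2] += x[i1];
   }
   zypre_BoxLoop2End(i1, i2);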
 
 
diff --git a/src/struct_mv/box_manager.c b/src/struct_mv/box_manager.c
index 0e217f1..cdca432 100644
--- a/src/struct_mv/box_manager.c
+++ b/src/struct_mv/box_manager.c
@@ -2226,7 +2226,7 @@ hypre_BoxManAssemble( hypre_BoxManager *manager )
       HYPRE_Int  size[HYPRE_MAXDIM];
       HYPRE_Int  iminmax[2];
       HYPRE_Int  index_not_there;
-      HYPRE_Int  d, e, ii, itsize;
+      HYPRE_Int  d, e, itsize;
       HYPRE_Int  mystart, myfinish;
       HYPRE_Int  imin[HYPRE_MAXDIM];
       HYPRE_Int  imax[HYPRE_MAXDIM];
@@ -2388,9 +2388,7 @@ hypre_BoxManAssemble( hypre_BoxManager *manager )
             /* set up index table */
             hypre_BoxSetExtents(index_box, imin, imax);
             hypre_BoxGetSize(index_box, loop_size);
-            hypre_BoxLoop1Begin(ndim, loop_size, table_box, imin, stride, ii);
-            hypre_BoxLoopSetOneBlock();
-            hypre_BoxLoop1For(ii)
+            hypre_SerialBoxLoop1Begin(ndim, loop_size, table_box, imin, stride, ii);
             {
                if (!index_table[ii]) /* no entry- add one */
                {
@@ -2403,7 +2401,7 @@ hypre_BoxManAssemble( hypre_BoxManager *manager )
                   index_table[ii] = entry;
                }
             }
-            hypre_BoxLoop1End(ii);
+            hypre_SerialBoxLoop1End(ii);
 
          } /* end of subset of entries */
       }/* end of three loops over subsets */
@@ -2464,7 +2462,7 @@ hypre_BoxManIntersect ( hypre_BoxManager *manager,
                         HYPRE_Int *nentries_ptr )
 {
    HYPRE_Int           ndim = hypre_BoxManNDim(manager);
-   HYPRE_Int           d, ii;
+   HYPRE_Int           d;
    HYPRE_Int           find_index_d, current_index_d;
    HYPRE_Int          *man_indexes_d;
    HYPRE_Int           man_index_size_d;
@@ -2581,9 +2579,7 @@ hypre_BoxManIntersect ( hypre_BoxManager *manager,
    hypre_BoxShiftNeg(table_box, stride); /* Want box to start at 0*/
    hypre_BoxSetExtents(index_box, man_ilower, man_iupper);
    hypre_BoxGetSize(index_box, loop_size);
-   hypre_BoxLoop1Begin(ndim, loop_size, table_box, man_ilower, stride, ii);
-   hypre_BoxLoopSetOneBlock();
-   hypre_BoxLoop1For(ii)
+   hypre_SerialBoxLoop1Begin(ndim, loop_size, table_box, man_ilower, stride, ii);
    {
       entry = index_table[ii];
 
@@ -2601,7 +2597,7 @@ hypre_BoxManIntersect ( hypre_BoxManager *manager,
          entry = hypre_BoxManEntryNext(entry);
       }
    }
-   hypre_BoxLoop1End(ii);
+   hypre_SerialBoxLoop1End(ii);
 
    entries  = hypre_TReAlloc(entries, hypre_BoxManEntry *, nentries);
 
diff --git a/src/struct_mv/boxloop.h b/src/struct_mv/boxloop.h
new file mode 100644
index 0000000..0583d16
--- /dev/null
+++ b/src/struct_mv/boxloop.h
@@ -0,0 +1,384 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+#ifdef HYPRE_USING_OPENMP
+#define Pragma(x) _Pragma(#x)
+#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_SMP_SCHEDULE)
+#define OMPREDUCTION() Pragma(omp parallel for private(HYPRE_BOX_PRIVATE,HYPRE_BOX_PRIVATE_VAR) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE)
+#else
+#define OMP1
+#define OMPREDUCTION() ;
+#endif
+
+typedef struct hypre_Boxloop_struct
+{
+   HYPRE_Int lsize0,lsize1,lsize2;
+   HYPRE_Int strides0,strides1,strides2;
+   HYPRE_Int bstart0,bstart1,bstart2;
+   HYPRE_Int bsize0,bsize1,bsize2;
+} hypre_Boxloop;
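 
[Editor's note] hypre_Boxloop is a flattened, copy-by-value description of one
box traversal: lsize* give the loop extents, strides* the index strides, and
bstart*/bsize* the offset and extent within the enclosing data box. A plain-C
model of the linearization the loops in this file and in boxloop_cuda.h
perform (illustration only, not hypre API):

   /* Peel idx dimension by dimension with modulo/divide, scaling each
      contribution by the enclosing data-box extent accumulated so far. */
   static HYPRE_Int boxloop_offset(hypre_Boxloop b, HYPRE_Int idx)
   {
      HYPRE_Int i1 = 0, boxD = 1, local;
      local = idx % b.lsize0;  idx /= b.lsize0;
      i1   += (local*b.strides0 + b.bstart0) * boxD;
      boxD *= (b.bsize0 + 1 > 0) ? b.bsize0 + 1 : 0;
      local = idx % b.lsize1;  idx /= b.lsize1;
      i1   += (local*b.strides1 + b.bstart1) * boxD;
      boxD *= (b.bsize1 + 1 > 0) ? b.bsize1 + 1 : 0;
      local = idx % b.lsize2;
      i1   += (local*b.strides2 + b.bstart2) * boxD;
      return i1;
   }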
+
+#define zypre_newBoxLoop0Begin(ndim, loop_size)				\
+{\
+   zypre_BoxLoopDeclare();									\
+   zypre_BoxLoopInit(ndim, loop_size);						\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
+   {\
+      zypre_BoxLoopSet();\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop0End()\
+         }\
+         zypre_BoxLoopInc1();\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define zypre_newBoxLoop1Begin(ndim, loop_size,\
+                               dbox1, start1, stride1, i1)\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop1End(i1)\
+            i1 += hypre__i0inc1;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+
+#define zypre_newBoxLoop2Begin(ndim, loop_size,\
+							   dbox1, start1, stride1, i1,	\
+							   dbox2, start2, stride2, i2)	\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop2End(i1, i2)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+
+#define zypre_newBoxLoop3Begin(ndim, loop_size,\
+							   dbox1, start1, stride1, i1,	\
+							   dbox2, start2, stride2, i2,	\
+							   dbox3, start3, stride3, i3)	\
+{														\
+   zypre_BoxLoopDeclare();									\
+   zypre_BoxLoopDeclareK(1);								\
+   zypre_BoxLoopDeclareK(2);								\
+   zypre_BoxLoopDeclareK(3);								\
+   zypre_BoxLoopInit(ndim, loop_size);						\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);		\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);		\
+   zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3);		\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      zypre_BoxLoopSetK(3, i3);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop3End(i1, i2, i3)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+            i3 += hypre__i0inc3;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         i3 += hypre__ikinc3[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define zypre_newBoxLoop4Begin(ndim, loop_size,\
+                            dbox1, start1, stride1, i1,\
+                            dbox2, start2, stride2, i2,\
+                            dbox3, start3, stride3, i3,\
+                            dbox4, start4, stride4, i4)\
+{\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopDeclareK(3);\
+   zypre_BoxLoopDeclareK(4);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3);\
+   zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4);\
+   OMP1\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      zypre_BoxLoopSetK(3, i3);\
+      zypre_BoxLoopSetK(4, i4);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define zypre_newBoxLoop4End(i1, i2, i3, i4)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+            i3 += hypre__i0inc3;\
+            i4 += hypre__i0inc4;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         i3 += hypre__ikinc3[hypre__d];\
+         i4 += hypre__ikinc4[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+                                        sum)				\
+{									\
+   HYPRE_Int i1;							\
+   zypre_BoxLoopDeclare();						\
+   zypre_BoxLoopDeclareK(1);						\
+   zypre_BoxLoopInit(ndim, loop_size);					\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);			\
+   OMPREDUCTION()							\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define hypre_newBoxLoop1ReductionEnd(i1, sum)\
+            i1 += hypre__i0inc1;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,				\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,	\
+                                        sum)							\
+{\
+   HYPRE_Int i1,i2;				\
+   zypre_BoxLoopDeclare();\
+   zypre_BoxLoopDeclareK(1);\
+   zypre_BoxLoopDeclareK(2);\
+   zypre_BoxLoopInit(ndim, loop_size);\
+   zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1);\
+   zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2);\
+   OMPREDUCTION()														\
+   for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++)	\
+   {\
+      zypre_BoxLoopSet();\
+      zypre_BoxLoopSetK(1, i1);\
+      zypre_BoxLoopSetK(2, i2);\
+      for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++)\
+      {\
+         for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++)\
+         {
+
+#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)\
+            i1 += hypre__i0inc1;\
+            i2 += hypre__i0inc2;\
+         }\
+         zypre_BoxLoopInc1();\
+         i1 += hypre__ikinc1[hypre__d];\
+         i2 += hypre__ikinc2[hypre__d];\
+         zypre_BoxLoopInc2();\
+      }\
+   }\
+}
+
+#define hypre_LoopBegin(size,idx)			\
+{									\
+   HYPRE_Int idx;							\
+   for (idx = 0;idx < size;idx ++)					\
+   {
+
+#define hypre_LoopEnd()					\
+  }							\
+}
+
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{									\
+    HYPRE_Int hypre__tot = 1;						\
+    hypre_Boxloop databox1;						\
+    HYPRE_Int d,idx;							\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (d = 0;d < ndim;d ++)						\
+    {									\
+	hypre__tot *= loop_size[d];					\
+    }									\
+    for (idx = 0;idx < hypre__tot;idx++)				\
+      {									\
+	  HYPRE_Int local_idx;						\
+	  HYPRE_Int idx_local = idx;					\
+	  HYPRE_Int i1 = 0;						\
+	  local_idx  = idx_local % databox1.lsize0;			\
+	  idx_local  = idx_local / databox1.lsize0;			\
+	  i1 += local_idx*databox1.strides0;				\
+	  local_idx  = idx_local % databox1.lsize1;			\
+	  idx_local  = idx_local / databox1.lsize1;			\
+	  i1 += local_idx*databox1.strides1;				\
+	  local_idx  = idx_local % databox1.lsize2;			\
+	  idx_local  = idx_local / databox1.lsize2;			\
+	  i1 += local_idx*databox1.strides2;				\
+
+
+#define hypre_BoxBoundaryCopyEnd()					\
+  }									\
+}
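+
+/* [Editor's note] The copy loops above recover multi-dimensional coordinates
+ * from the flat counter idx by repeated modulo/divide against lsize0..lsize2,
+ * accumulating strided offsets. A self-contained worked check of that
+ * arithmetic in plain C (illustration only, not hypre code):
+ *
+ *   #include <stdio.h>
+ *
+ *   int main(void)
+ *   {
+ *      int lsize[3] = {3, 2, 2}, stride[3] = {1, 4, 8};
+ *      int idx = 5, i1 = 0, d;
+ *      for (d = 0; d < 3; d++)
+ *      {
+ *         i1  += (idx % lsize[d]) * stride[d];  // local coordinate * stride
+ *         idx /= lsize[d];
+ *      }
+ *      printf("i1 = %d\n", i1);  // (2,1,0) -> 2*1 + 1*4 + 0*8 = 6
+ *      return 0;
+ *   }
+ */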
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,			\
+                                   stride1, i1,				\
+                                   stride2, i2)				\
+{									\
+   HYPRE_Int hypre__tot = 1,idx;					\
+   hypre_Boxloop databox1,databox2;					\
+   HYPRE_Int d;								\
+   databox1.lsize0 = loop_size[0];					\
+   databox1.lsize1 = loop_size[1];					\
+   databox1.lsize2 = loop_size[2];					\
+   databox1.strides0 = stride1[0];					\
+   databox1.strides1 = stride1[1];					\
+   databox1.strides2 = stride1[2];					\
+   databox2.lsize0 = loop_size[0];					\
+   databox2.lsize1 = loop_size[1];					\
+   databox2.lsize2 = loop_size[2];					\
+   databox2.strides0 = stride2[0];					\
+   databox2.strides1 = stride2[1];					\
+   databox2.strides2 = stride2[2];					\
+   for (d = 0;d < ndim;d ++)						\
+   {									\
+      hypre__tot *= loop_size[d];					\
+   }									\
+   for (idx = 0;idx < hypre__tot;idx++)					\
+   {									\
+      HYPRE_Int local_idx;						\
+      HYPRE_Int idx_local = idx;					\
+      HYPRE_Int i1 = 0, i2 = 0;						\
+      local_idx  = idx_local % databox1.lsize0;				\
+      idx_local  = idx_local / databox1.lsize0;				\
+      i1 += local_idx*databox1.strides0;				\
+      i2 += local_idx*databox2.strides0;				\
+      local_idx  = idx_local % databox1.lsize1;				\
+      idx_local  = idx_local / databox1.lsize1;				\
+      i1 += local_idx*databox1.strides1;				\
+      i2 += local_idx*databox2.strides1;				\
+      local_idx  = idx_local % databox1.lsize2;				\
+      idx_local  = idx_local / databox1.lsize2;				\
+      i1 += local_idx*databox1.strides2;				\
+      i2 += local_idx*databox2.strides2;
+
+#define hypre_BoxDataExchangeEnd()					\
+   }                                                                    \
+}
+
+#define hypre_newBoxLoopGetIndex zypre_BoxLoopGetIndex  
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock
+#define hypre_BoxLoopBlock       zypre_BoxLoopBlock
+#define hypre_BoxLoop0Begin      zypre_BoxLoop0Begin
+#define hypre_BoxLoop0For        zypre_BoxLoop0For
+#define hypre_BoxLoop0End        zypre_BoxLoop0End
+#define hypre_BoxLoop1Begin      zypre_BoxLoop1Begin
+#define hypre_BoxLoop1For        zypre_BoxLoop1For
+#define hypre_BoxLoop1End        zypre_BoxLoop1End
+#define hypre_BoxLoop2Begin      zypre_BoxLoop2Begin
+#define hypre_BoxLoop2For        zypre_BoxLoop2For
+#define hypre_BoxLoop2End        zypre_BoxLoop2End
+#define hypre_BoxLoop3Begin      zypre_BoxLoop3Begin
+#define hypre_BoxLoop3For        zypre_BoxLoop3For
+#define hypre_BoxLoop3End        zypre_BoxLoop3End
+#define hypre_BoxLoop4Begin      zypre_BoxLoop4Begin
+#define hypre_BoxLoop4For        zypre_BoxLoop4For
+#define hypre_BoxLoop4End        zypre_BoxLoop4End
+#define hypre_BasicBoxLoop2Begin zypre_BasicBoxLoop2Begin
+
+#endif
diff --git a/src/struct_mv/boxloop_cuda.h b/src/struct_mv/boxloop_cuda.h
new file mode 100644
index 0000000..03db355
--- /dev/null
+++ b/src/struct_mv/boxloop_cuda.h
@@ -0,0 +1,717 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+struct cuda_traversal {HYPRE_Int cuda;};
+struct omp_traversal  {HYPRE_Int omp;};
+#define hypre_exec_policy cuda_traversal()
+#define HYPER_LAMBDA [=] __device__
+
+typedef struct hypre_Boxloop_struct
+{
+	HYPRE_Int lsize0,lsize1,lsize2;
+	HYPRE_Int strides0,strides1,strides2;
+	HYPRE_Int bstart0,bstart1,bstart2;
+	HYPRE_Int bsize0,bsize1,bsize2;
+} hypre_Boxloop;
+
+#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+{
+    if (err)
+    {
+        printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+    }
+}
+#define BLOCKSIZE 128
+
+#define hypre_fence() \
+  cudaError err = cudaGetLastError();		\
+if ( cudaSuccess != err )\
+{\
+   printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__);\
+}									\
+AxCheckError(cudaDeviceSynchronize());
+
+extern "C++" {
+template <typename LOOP_BODY>
+__global__ void forall_kernel(LOOP_BODY loop_body, HYPRE_Int length)
+{
+	HYPRE_Int idx = blockDim.x * blockIdx.x + threadIdx.x;
+	if (idx < length)
+		loop_body(idx);
+}
+
+template<typename LOOP_BODY>
+void BoxLoopforall (cuda_traversal, HYPRE_Int length, LOOP_BODY loop_body)
+{	
+	size_t const blockSize = 128;
+	size_t gridSize  = (length + blockSize - 1) / blockSize;
+	if (gridSize == 0) gridSize = 1;
+	
+	//hypre_printf("length= %d, blocksize = %d, gridsize = %d\n",length,blockSize,gridSize);
+	forall_kernel<<<gridSize, blockSize>>>(loop_body,length);
+}
+
+template<typename LOOP_BODY>
+void BoxLoopforall (omp_traversal, HYPRE_Int length, LOOP_BODY loop_body)
+{
+
+#pragma omp parallel for schedule(static)
+	for (HYPRE_Int idx = 0;idx < length;idx++)
+		loop_body(idx);
+}
+
+#define zypre_BoxLoopIncK(k,box,i)					\
+{       								\
+HYPRE_Int idx = idx_local;						\
+local_idx  = idx % box.lsize0;					\
+idx        = idx / box.lsize0;					\
+i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+local_idx  = idx % box.lsize1;					\
+idx        = idx / box.lsize1;					\
+i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+local_idx  = idx % box.lsize2;					\
+idx  = idx / box.lsize2;					\
+i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+}
+
+
+template<class T>
+__global__ void reduction_mult (T * a, T * b, HYPRE_Int hypre__tot,
+				hypre_Boxloop box1)
+{
+    HYPRE_Int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+    HYPRE_Int local_idx;
+    HYPRE_Int idx_local = id;
+    HYPRE_Int hypre_boxD1 = 1;
+    HYPRE_Int i1 = 0;
+    //// reduced output
+    __shared__ T shared_cache [BLOCKSIZE];
+    T sum = 1;
+    local_idx  = idx_local % box1.lsize0;
+    idx_local  = idx_local / box1.lsize0;
+    i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+    local_idx  = idx_local % box1.lsize1;
+    idx_local  = idx_local / box1.lsize1;
+    i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);	
+    local_idx  = idx_local % box1.lsize2;	      
+    idx_local  = idx_local / box1.lsize2;		      
+    i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+    if (id < hypre__tot)
+      sum = a[i1];
+    *(shared_cache + threadIdx.x) = sum;
+    
+    __syncthreads();
+    
+    ///////// sum of internal cache
+    
+    HYPRE_Int i;    
+    
+    for (i=(BLOCKSIZE /2); i>0 ; i= i/2){
+      if (threadIdx.x < i){
+	*(shared_cache + threadIdx.x) *= *(shared_cache + threadIdx.x + i);
+      }
+      __syncthreads();
+    }
+    
+    if ( threadIdx.x == 0){
+      *(b+ blockIdx.x) = shared_cache[0];
+    }
+}
+}
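+
+/* [Editor's note] reduction_mult computes a per-block product: each thread
+ * loads one element into shared memory, then the cache is folded in half
+ * log2(BLOCKSIZE) times. A host-side model of that folding in plain C,
+ * serializing what the threads do in parallel (illustration only):
+ *
+ *   #include <stdio.h>
+ *
+ *   int main(void)
+ *   {
+ *      double cache[8] = {1, 2, 3, 4, 5, 6, 7, 8};  // BLOCKSIZE = 8 here
+ *      int i, t;
+ *      for (i = 8/2; i > 0; i /= 2)
+ *         for (t = 0; t < i; t++)   // on the GPU, every t runs in parallel
+ *            cache[t] *= cache[t + i];
+ *      printf("product = %g\n", cache[0]);  // 8! = 40320
+ *      return 0;
+ *   }
+ */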
+
+#define hypre_BoxLoopInit(ndim,loop_size)					\
+	HYPRE_Int hypre__tot = 1;											\
+	for (HYPRE_Int i = 0;i < ndim;i ++)									\
+		hypre__tot *= loop_size[i];
+
+
+#define hypre_newBoxLoopDeclare()\
+	HYPRE_Int hypre__i,hypre__j,hypre__k;\
+	HYPRE_Int idx_local = idx;
+
+#define hypre_newBoxLoop0Begin(ndim, loop_size)				\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {
+
+#define hypre_newBoxLoop0End()					\
+    });									\
+    hypre_fence();							\
+}
+
+#define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	hypre_Boxloop databox##k;											\
+	databox##k.lsize0 = loop_size[0];				\
+	databox##k.strides0 = stride[0];				\
+	databox##k.bstart0  = start[0] - dbox->imin[0];			\
+	databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	if (ndim > 1)							\
+	{								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];	\
+	}								\
+	else						        	\
+	{							       	\
+		databox##k.lsize1 = 1;				       	\
+		databox##k.strides1 = 0;		       		\
+		databox##k.bstart1  = 0;	       			\
+		databox##k.bsize1   = 0;		       		\
+	}								\
+	if (ndim == 3)							\
+	{							      	\
+	      databox##k.lsize2 = loop_size[2];				\
+	      databox##k.strides2 = stride[2];				\
+	      databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	      databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];	\
+	}				                        	\
+	else						        	\
+	{							       	\
+		databox##k.lsize2 = 1;				       	\
+		databox##k.strides2 = 0;		       		\
+		databox##k.bstart2  = 0;	       			\
+		databox##k.bsize2   = 0;		       		\
+	}
+
+#define hypre_newBoxLoop1Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);				\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+      hypre_newBoxLoopDeclare();					\
+      HYPRE_Int hypre_boxD1 = 1;					\
+      HYPRE_Int i1 = 0;							\
+      hypre__i  = idx_local % databox1.lsize0;				\
+      idx_local = idx_local / databox1.lsize0;				\
+      i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+      hypre__j  = idx_local % databox1.lsize1;				\
+      idx_local = idx_local / databox1.lsize1;				\
+      i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+      hypre__k  = idx_local % databox1.lsize2;				\
+      idx_local = idx_local / databox1.lsize2;				\
+      i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+      hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);
+      
+#define hypre_newBoxLoop1End(i1)				\
+    });									\
+    hypre_fence();							\
+}
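+
+/* [Editor's note] The CUDA loop bodies become device lambdas, so each thread
+ * recomputes its i1 from the flat idx; anything referenced between Begin and
+ * End must be device-accessible. A minimal usage sketch (xp is an assumption
+ * and must point to device memory):
+ *
+ *   hypre_newBoxLoop1Begin(ndim, loop_size, dbox, start, stride, i1)
+ *   {
+ *      xp[i1] = 0.0;
+ *   }
+ *   hypre_newBoxLoop1End(i1)
+ */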
+	
+#define hypre_newBoxLoop2Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+        hypre_newBoxLoopDeclare()					\
+        HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+#define hypre_newBoxLoop2End(i1, i2)			\
+    });							\
+    hypre_fence();					\
+}
+
+#define hypre_newBoxLoop3Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3)		\
+{									\
+    hypre_BoxLoopInit(ndim,loop_size);						\
+    hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+	hypre_newBoxLoopDeclare();					\
+	HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	hypre__i  = idx_local % databox1.lsize0;				\
+	idx_local  = idx_local / databox1.lsize0;				\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	hypre__j   = idx_local % databox1.lsize1;				\
+	idx_local  = idx_local / databox1.lsize1;				\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	hypre__k  = idx_local % databox1.lsize2;				\
+	idx_local  = idx_local / databox1.lsize2;				\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	i3 += (hypre__k*databox3.strides2 +databox3.bstart2) * hypre_boxD3;	\
+	hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);			\
+	
+
+#define hypre_newBoxLoop3End(i1, i2,i3)			\
+    });									\
+    hypre_fence();							\
+}
+
+#define hypre_newBoxLoop4Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3,		\
+			       dbox4, start4, stride4, i4)		\
+{								       \
+     hypre_BoxLoopInit(ndim,loop_size);			       \
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+     hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+     hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \
+     BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+     {									\
+        hypre_newBoxLoopDeclare();					\
+	HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (hypre__i*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (hypre__i*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	i3 += (hypre__i*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	i4 += (hypre__i*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (hypre__j*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (hypre__j*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	i3 += (hypre__j*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	i4 += (hypre__j*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (hypre__k*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (hypre__k*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	i3 += (hypre__k*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	i4 += (hypre__k*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+		
+#define hypre_newBoxLoop4End(i1, i2, i3, i4)	\
+    });						\
+    hypre_fence();				\
+}
+
+#define MAX_BLOCK 512
+
+extern "C++" {
+template<class T>
+__inline__ __device__
+HYPRE_Int fake_shfl_down(T val, HYPRE_Int offset, HYPRE_Int width=32) {
+  static __shared__ T shared[MAX_BLOCK];
+  HYPRE_Int lane=threadIdx.x%32;
+
+  shared[threadIdx.x]=val;
+  __syncthreads();
+
+  val = (lane+offset<width) ? shared[threadIdx.x+offset] : 0;
+  __syncthreads();
+
+  return val;
+}
+
+template<class T>  
+__inline__ __device__
+HYPRE_Real warpReduceSum (T val) {
+  for (HYPRE_Int offset = warpSize/2; offset > 0; offset /= 2)
+    val += __shfl_down(val,offset);
+  return val;
+}
+
+
+template<class T> 
+__inline__ __device__
+HYPRE_Real blockReduceSum(T val) {
+  static __shared__ T shared[32];
+  HYPRE_Int lane=threadIdx.x%warpSize;
+  HYPRE_Int wid=threadIdx.x/warpSize;
+  val=warpReduceSum<T>(val);
+
+  //write reduced value to shared memory
+  if(lane==0) shared[wid]=val;
+  __syncthreads();
+
+  //ensure we only grab a value from shared memory if that warp existed
+  val = (threadIdx.x<blockDim.x/warpSize) ? shared[lane] : HYPRE_Int(0);
+  if(wid==0) val=warpReduceSum<T>(val);
+
+  return val;
+}
+
+template<class T>
+__global__ void hypre_device_reduce_stable_kernel(T*a, T*b, T* out, HYPRE_Int N,
+						  hypre_Boxloop box1,hypre_Boxloop box2) {
+  HYPRE_Int local_idx;
+  HYPRE_Int idx_local;
+  HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;
+  HYPRE_Int i1 = 0, i2 = 0;
+  T sum=T(0);
+  HYPRE_Int i;
+  
+  for(i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x)
+  {
+    idx_local = i;
+    local_idx  = idx_local % box1.lsize0;
+    idx_local  = idx_local / box1.lsize0;
+    i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+    i2 += (local_idx*box2.strides0 + box2.bstart0) * hypre_boxD2;
+    hypre_boxD2 *= hypre_max(0, box2.bsize0 + 1);
+    local_idx  = idx_local % box1.lsize1;
+    idx_local  = idx_local / box1.lsize1;
+    i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);
+    i2 += (local_idx*box2.strides1 + box2.bstart1) * hypre_boxD2;   
+    hypre_boxD2 *= hypre_max(0, box2.bsize1 + 1);	
+    local_idx  = idx_local % box1.lsize2;	      
+    idx_local  = idx_local / box1.lsize2;		      
+    i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+    hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+    i2 += (local_idx*box2.strides2 + box2.bstart2) * hypre_boxD2;
+    hypre_boxD2 *= hypre_max(0, box2.bsize2 + 1);
+    sum += a[i1] * hypre_conj(b[i2]);
+  }
+  sum=blockReduceSum<T>(sum);
+  if(threadIdx.x==0)
+    out[blockIdx.x]=sum;
+}
+
+template<class T>       
+__global__ void hypre_device_reduce_stable_kernel2(T *in, T* out, HYPRE_Int N) {
+  T sum=T(0);
+  for(HYPRE_Int i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x) {
+    sum+=in[i];
+  }
+  sum=blockReduceSum<T>(sum);
+  if(threadIdx.x==0)
+    out[blockIdx.x]=sum;
+}
+
+template<class T>   
+void hypre_device_reduce_stable(T*a,T*b, T* out, HYPRE_Int N,
+				hypre_Boxloop box1,hypre_Boxloop box2) {
+  HYPRE_Int threads=512;
+  HYPRE_Int blocks=min((N+threads-1)/threads,1024);
+
+  hypre_device_reduce_stable_kernel<T><<<blocks,threads>>>(a,b,out,N,box1,box2);
+  hypre_device_reduce_stable_kernel2<T><<<1,1024>>>(out,out,blocks); 
+}
+
+}
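+
+/* [Editor's note] The reductions run in two stages: the first kernel leaves
+ * one partial result per block (at most 1024, hence the 1024-slot scratch
+ * buffers in the macros below), and the second folds those partials with a
+ * single block. A plain-C model of the stage-1 grid-stride walk, with
+ * blockDim collapsed to 1 for illustration:
+ *
+ *   #include <stdio.h>
+ *
+ *   int main(void)
+ *   {
+ *      double in[10] = {1,2,3,4,5,6,7,8,9,10}, part[3] = {0,0,0}, sum = 0;
+ *      int N = 10, blocks = 3, b, i;
+ *      for (b = 0; b < blocks; b++)        // stage 1: one partial per block
+ *         for (i = b; i < N; i += blocks)  // grid-stride walk over the data
+ *            part[b] += in[i];
+ *      for (b = 0; b < blocks; b++)        // stage 2: fold the partials
+ *         sum += part[b];
+ *      printf("sum = %g\n", sum);          // 55
+ *      return 0;
+ *   }
+ */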
+
+extern "C++" {
+template <typename LOOP_BODY>
+__global__ void hypre_device_reduction_kernel(HYPRE_Real* out,
+					      HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2,
+					      LOOP_BODY loop_body)
+{
+    HYPRE_Int local_idx;
+    HYPRE_Int idx_local;
+    HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;
+    HYPRE_Int i1 = 0, i2 = 0;
+    HYPRE_Real sum = HYPRE_Real(0);
+    HYPRE_Int i;
+    
+    for(i=blockIdx.x*blockDim.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x)
+      {
+	idx_local = i;
+	local_idx  = idx_local % box1.lsize0;
+	idx_local  = idx_local / box1.lsize0;
+	i1 += (local_idx*box1.strides0 + box1.bstart0) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize0 + 1);
+	i2 += (local_idx*box2.strides0 + box2.bstart0) * hypre_boxD2;
+	hypre_boxD2 *= hypre_max(0, box2.bsize0 + 1);
+	local_idx  = idx_local % box1.lsize1;
+	idx_local  = idx_local / box1.lsize1;
+	i1 += (local_idx*box1.strides1 + box1.bstart1) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize1 + 1);
+	i2 += (local_idx*box2.strides1 + box2.bstart1) * hypre_boxD2;   
+	hypre_boxD2 *= hypre_max(0, box2.bsize1 + 1);	
+	local_idx  = idx_local % box1.lsize2;	      
+	idx_local  = idx_local / box1.lsize2;		      
+	i1 += (local_idx*box1.strides2 + box1.bstart2) * hypre_boxD1;
+	hypre_boxD1 *= hypre_max(0, box1.bsize2 + 1);	
+	i2 += (local_idx*box2.strides2 + box2.bstart2) * hypre_boxD2;
+	hypre_boxD2 *= hypre_max(0, box2.bsize2 + 1);
+	sum = loop_body(i1,i2,sum);
+      }
+    sum=blockReduceSum<HYPRE_Real>(sum);
+    if(threadIdx.x==0)
+      out[blockIdx.x]=sum;
+}
+
+template<typename LOOP_BODY>
+void hypre_device_reduction (HYPRE_Real* out,
+			     HYPRE_Int N,hypre_Boxloop box1,hypre_Boxloop box2,
+			     LOOP_BODY loop_body)
+{	
+  HYPRE_Int threads=512;
+  HYPRE_Int blocks=min((N+threads-1)/threads,1024);
+
+  hypre_device_reduction_kernel<<<blocks,threads>>>(out,N,box1,box2,loop_body);
+  hypre_device_reduce_stable_kernel2<HYPRE_Real><<<1,1024>>>(out,out,blocks);
+
+}
+}
+
+#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1, sum) \
+{    									   \
+   HYPRE_Real sum_old = sum;						\
+   sum = 0.0;								\
+   hypre_BoxLoopInit(ndim,loop_size);					\
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+   HYPRE_Real *d_c;							\
+   cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real));		\
+   hypre_device_reduction(d_c,hypre__tot,databox1,databox1,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \
+   {
+
+#define hypre_newBoxLoop1ReductionEnd(i1, sum)			\
+       return sum;								\
+   });									\
+  cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost);	\
+  sum += sum_old;							\
+  cudaFree(d_c);							\
+}
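+
+/* [Editor's note] The reduction macros save the incoming value, zero the
+ * accumulator, run the device reduction, then fold the old value back in, so
+ * the net effect is sum += <loop sum>. A usage sketch (xp is an assumption
+ * and must be device-resident; the body becomes the reduction lambda):
+ *
+ *   HYPRE_Real sum = 0.0;
+ *   hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox, start, stride, i1, sum)
+ *   {
+ *      sum += xp[i1] * xp[i1];
+ *   }
+ *   hypre_newBoxLoop1ReductionEnd(i1, sum)
+ */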
+
+#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,sum) \
+{    									   \
+   HYPRE_Real sum_old = sum;						\
+   sum = 0.0;								\
+   hypre_BoxLoopInit(ndim,loop_size);					\
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+   hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+   HYPRE_Real *d_c;							\
+   cudaMalloc((void**) &d_c, 1024 * sizeof(HYPRE_Real));		\
+   hypre_device_reduction(d_c,hypre__tot,databox1,databox2,HYPER_LAMBDA(HYPRE_Int i1, HYPRE_Int i2, HYPRE_Real sum) \
+   {
+
+#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)			\
+      return sum;								\
+   });									\
+  cudaMemcpy(&sum,d_c,sizeof(HYPRE_Real),cudaMemcpyDeviceToHost);	\
+  sum += sum_old;							\
+  cudaFree(d_c);							\
+}
+
+
+
+#define hypre_newBoxLoop1ReductionMult(ndim, loop_size,			  \
+				       dbox1, start1, stride1, i1,xp,sum) \
+{    									  \
+   HYPRE_Real sum_old = sum;\
+   sum = 1.0;\
+   hypre_BoxLoopInit(ndim,loop_size);				  \
+   hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	  \
+   HYPRE_Int n_blocks = (hypre__tot+BLOCKSIZE-1)/BLOCKSIZE;		  \
+   HYPRE_Real *d_b;				        		  \
+   HYPRE_Real * b = new HYPRE_Real[n_blocks];				  \
+   cudaMalloc((void**) &d_b, n_blocks * sizeof(HYPRE_Real));			\
+   reduction_mult<HYPRE_Real><<<n_blocks, BLOCKSIZE>>>(xp,d_b,hypre__tot,databox1); \
+   hypre_fence();							\
+   cudaMemcpy(b,d_b,n_blocks*sizeof(HYPRE_Real),cudaMemcpyDeviceToHost); \
+   for (HYPRE_Int j = 0; j < n_blocks; ++j){				\
+     sum *= b[j];							\
+   }									\
+   delete [] b;								\
+   cudaFree(d_b);							\
+   sum *= sum_old;\
+}
+
+#define hypre_LoopBegin(size,idx)					\
+{    														\
+	BoxLoopforall(hypre_exec_policy,size,HYPER_LAMBDA (HYPRE_Int idx) \
+	{
+
+#define hypre_LoopEnd()					\
+	});											\
+        hypre_fence();\
+}
+
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+    {									\
+	hypre__tot *= loop_size[d];					\
+    }									\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+	    hypre_newBoxLoopDeclare()											\
+	    HYPRE_Int i1 = 0;											\
+	    hypre__i  = idx_local % databox1.lsize0;			\
+	    idx_local  = idx_local / databox1.lsize0;			\
+	    i1 += hypre__i*databox1.strides0;				\
+	    hypre__j  = idx_local % databox1.lsize1;			\
+	    idx_local  = idx_local / databox1.lsize1;			\
+	    i1 += hypre__j*databox1.strides1;				\
+	    hypre__k  = idx_local % databox1.lsize2;			\
+	    idx_local  = idx_local / databox1.lsize2;			\
+	    i1 += hypre__k*databox1.strides2;				\
+		
+#define hypre_BoxBoundaryCopyEnd()				\
+    });									\
+    hypre_fence();							\
+}
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];									\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];									\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+      {									\
+	hypre__tot *= loop_size[d];					\
+      }									\
+    BoxLoopforall(hypre_exec_policy,hypre__tot,HYPER_LAMBDA (HYPRE_Int idx) \
+    {									\
+        hypre_newBoxLoopDeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	hypre__i  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += hypre__i*databox1.strides0;				\
+	i2 += hypre__i*databox2.strides0;				\
+	hypre__j  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += hypre__j*databox1.strides1;				\
+	i2 += hypre__j*databox2.strides1;				\
+	hypre__k  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += hypre__k*databox1.strides2;				\
+	i2 += hypre__k*databox2.strides2;
+
+
+#define hypre_BoxDataExchangeEnd()				\
+     });								\
+     hypre_fence();							\
+}
+  
+#define hypre_newBoxLoop0For()
+
+#define hypre_newBoxLoop1For(i1)
+
+#define hypre_newBoxLoop2For(i1, i2) 
+ 
+#define hypre_newBoxLoop3For(i1, i2, i3)
+
+#define hypre_newBoxLoop4For(i1, i2, i3, i4)
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+  
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex  
+#define hypre_BoxLoopSetOneBlock() ; 
+#define hypre_BoxLoopBlock()       0
+
+#define hypre_BoxLoop0Begin      hypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        hypre_newBoxLoop0For
+#define hypre_BoxLoop0End        hypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      hypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        hypre_newBoxLoop1For
+#define hypre_BoxLoop1End        hypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      hypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        hypre_newBoxLoop2For
+#define hypre_BoxLoop2End        hypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      hypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        hypre_newBoxLoop3For
+#define hypre_BoxLoop3End        hypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      hypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        hypre_newBoxLoop4For
+#define hypre_BoxLoop4End        hypre_newBoxLoop4End
+#endif
diff --git a/src/struct_mv/boxloop_kokkos.h b/src/struct_mv/boxloop_kokkos.h
new file mode 100644
index 0000000..f4a8113
--- /dev/null
+++ b/src/struct_mv/boxloop_kokkos.h
@@ -0,0 +1,542 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+extern "C++" {
+#include <Kokkos_Core.hpp>
+}
+#if defined( KOKKOS_HAVE_MPI )
+#include <mpi.h>
+#endif
+
+ typedef struct hypre_Boxloop_struct
+ {
+	 HYPRE_Int lsize0,lsize1,lsize2;
+	 HYPRE_Int strides0,strides1,strides2;
+	 HYPRE_Int bstart0,bstart1,bstart2;
+	 HYPRE_Int bsize0,bsize1,bsize2;
+ } hypre_Boxloop;
+
+ #if defined(HYPRE_MEMORY_GPU)
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+ #define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+ inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+ {
+     if (err)
+     {
+	 printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+     }
+ }
+ #define BLOCKSIZE 256
+
+ #define hypre_fence() \
+ cudaError err = cudaGetLastError();\
+ if ( cudaSuccess != err ) {\
+ printf("\n ERROR hypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__); \
+ }\
+ AxCheckError(cudaDeviceSynchronize());
+ #elif defined(HYPRE_USE_OPENMP)
+    #define hypre_fence() ;
+ #elif defined(HYPRE_USING_OPENMP_ACC)
+ #define hypre_fence()  
+ #else 
+    #define hypre_fence() ;
+ #endif
+
+ #define hypre_newBoxLoopInit(ndim,loop_size)					\
+	 HYPRE_Int hypre__tot = 1;											\
+	 for (HYPRE_Int i = 0;i < ndim;i ++)									\
+		 hypre__tot *= loop_size[i];
+
+
+ #define hypre_BoxLoopIncK(k,box,i)					\
+ {									\
+    HYPRE_Int idx = idx_local;						\
+    local_idx  = idx % box.lsize0;					\
+    idx        = idx / box.lsize0;					\
+    i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+    local_idx  = idx % box.lsize1;					\
+    idx        = idx / box.lsize1;					\
+    i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+    local_idx  = idx % box.lsize2;					\
+    idx  = idx / box.lsize2;					\
+    i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+    hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+ }
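+
+ /* Worked example of the decoding above (illustrative): for a 3x4x5 loop
+    with unit strides and zero bstart, the flattened index idx = 17
+    decodes as
+
+       local_idx = 17 % 3 = 2   (dimension 0),  idx -> 17 / 3 = 5
+       local_idx =  5 % 4 = 1   (dimension 1),  idx ->  5 / 4 = 1
+       local_idx =  1 % 5 = 1   (dimension 2)
+
+    i.e. the loop point (2,1,1).  The running factor hypre_boxD grows by
+    (bsize + 1), the number of data points along each dimension, so i
+    ends up a row-major offset into box k's data array. */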
+
+ #define hypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	 hypre_Boxloop databox##k;     					\
+	 databox##k.lsize0 = loop_size[0];				\
+	 databox##k.strides0 = stride[0];				\
+	 databox##k.bstart0  = start[0] - dbox->imin[0];		\
+	 databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	 if (ndim > 1)							\
+	 {								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];   	\
+	 }								\
+	 else						        	\
+	 {							       	\
+	    databox##k.lsize1 = 1;				       	\
+	    databox##k.strides1 = 0;					\
+	    databox##k.bstart1  = 0;					\
+	    databox##k.bsize1   = 0;					\
+	 }								\
+	 if (ndim == 3)							\
+	 {								\
+	    databox##k.lsize2 = loop_size[2];				\
+	    databox##k.strides2 = stride[2];				\
+	    databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	    databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];	\
+	 }								\
+	 else								\
+	 {								\
+	   databox##k.lsize2 = 1;					\
+	   databox##k.strides2 = 0;					\
+	   databox##k.bstart2  = 0;					\
+	   databox##k.bsize2   = 0;					\
+	 }
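+
+ /* Note: the bstart fields hold the loop start relative to the data-box
+    origin (start - imin), and the bsize fields hold the data-box extent
+    minus one (imax - imin).  The index macros multiply the running pitch
+    by (bsize + 1), the number of data points along a dimension, so the
+    computed offset indexes the box's data array in row-major order. */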
+
+ #define hypre_newBoxLoopDeclare()										\
+	 HYPRE_Int local_idx;												\
+	 HYPRE_Int idx_local = idx;
+
+ #define hypre_newBoxLoop0Begin(ndim, loop_size) 	\
+ {									\
+     hypre_newBoxLoopInit(ndim,loop_size);					\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {
+
+
+ #define hypre_newBoxLoop0End(i1)				\
+	 });											\
+ }
+
+
+ #define hypre_newBoxLoop1Begin(ndim, loop_size,				\
+				dbox1, start1, stride1, i1)		\
+ {									\
+     hypre_newBoxLoopInit(ndim,loop_size)						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+       hypre_newBoxLoopDeclare();						\
+       HYPRE_Int hypre_boxD1 = 1;					\
+       HYPRE_Int i1 = 0;							\
+       local_idx  = idx_local % databox1.lsize0;				\
+       idx_local  = idx_local / databox1.lsize0;				\
+       i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+       local_idx  = idx_local % databox1.lsize1;				\
+       idx_local  = idx_local / databox1.lsize1;				\
+       i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+       local_idx  = idx_local % databox1.lsize2;				\
+       idx_local  = idx_local / databox1.lsize2;				\
+       i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+       hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);
+
+
+ #define hypre_newBoxLoop1End(i1)					\
+     });									\
+     hypre_fence();							\
+ }
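+
+ /* Usage sketch (illustrative only): scaling one box of vector data.  The
+    names vp, alpha, v_data_box, loop_size, start, and stride are assumed
+    for this example and are not defined here.
+
+       hypre_newBoxLoop1Begin(ndim, loop_size,
+                              v_data_box, start, stride, vi)
+       {
+          vp[vi] *= alpha;
+       }
+       hypre_newBoxLoop1End(vi);
+ */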
+
+
+ #define hypre_newBoxLoop2Begin(ndim, loop_size,				\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2)		\
+ {    														\
+     hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	 HYPRE_Int i1 = 0, i2 = 0;					\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+ #define hypre_newBoxLoop2End(i1, i2)			\
+      });							\
+      hypre_fence();						\
+ }
+
+
+ #define hypre_newBoxLoop3Begin(ndim, loop_size,\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2,		\
+				dbox3, start3, stride3, i3)		\
+ {																	\
+  hypre_newBoxLoopInit(ndim,loop_size);						\
+      hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+      hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+      hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+      Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+      {									\
+	 hypre_newBoxLoopDeclare();					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	 local_idx  = idx_local % databox1.lsize0;				\
+	 idx_local  = idx_local / databox1.lsize0;				\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	 local_idx  = idx_local % databox1.lsize1;				\
+	 idx_local  = idx_local / databox1.lsize1;				\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	 local_idx  = idx_local % databox1.lsize2;				\
+	 idx_local  = idx_local / databox1.lsize2;				\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	 i3 += (local_idx*databox3.strides2 +databox3.bstart2) * hypre_boxD3;	\
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);
+
+ #define hypre_newBoxLoop3End(i1, i2, i3)			\
+     });							\
+     hypre_fence();					\
+ }
+
+ #define hypre_newBoxLoop4Begin(ndim, loop_size,\
+				dbox1, start1, stride1, i1,		\
+				dbox2, start2, stride2, i2,		\
+				dbox3, start3, stride3, i3,		\
+				dbox4, start4, stride4, i4)		\
+ {									\
+  hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     hypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3);	\
+     hypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4);	\
+     Kokkos::parallel_for (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)		\
+     {									\
+	 hypre_newBoxLoopDeclare();					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	 i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	 i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	 i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	 i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+
+
+ #define hypre_newBoxLoop4End(i1, i2, i3, i4)		\
+     });							\
+     hypre_fence();					\
+ }
+
+ #define hypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					 dbox1, start1, stride1, i1, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);					\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1;					\
+	 HYPRE_Int i1 = 0;						\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+
+
+
+ #define hypre_newBoxLoop1ReductionEnd(i1, sum)				\
+     },sum);								\
+     hypre_fence();							\
+     sum += sum_tmp;							\
+ }
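+
+ /* Usage sketch (illustrative only): summing one box's entries into a
+    host scalar through Kokkos::parallel_reduce.  xp and the geometry
+    arguments are assumed names for this example.
+
+       HYPRE_Real total = 0.0;
+       hypre_newBoxLoop1ReductionBegin(ndim, loop_size,
+                                       x_data_box, start, stride, xi, total)
+       {
+          total += xp[xi];
+       }
+       hypre_newBoxLoop1ReductionEnd(xi, total);
+ */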
+
+ #define hypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					 dbox1, start1, stride1, i1,	\
+					 dbox2, start2, stride2, i2, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 0.0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);				\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     hypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+	 HYPRE_Int i1 = 0, i2 = 0;					\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+
+
+ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum)			\
+     },sum);								\
+     hypre_fence();							\
+     sum +=sum_tmp;							\
+ }
+
+ #define hypre_newBoxLoop1ReductionMult(ndim, loop_size,		\
+					dbox1, start1, stride1, i1, xp, sum) \
+ {									\
+     HYPRE_Real sum_tmp = sum;						\
+     sum = 1.0;								\
+     hypre_newBoxLoopInit(ndim,loop_size);						\
+     hypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+     Kokkos::parallel_reduce (hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx,HYPRE_Real &sum) \
+     {									\
+	 hypre_newBoxLoopDeclare()					\
+	 HYPRE_Int hypre_boxD1 = 1;					\
+	 HYPRE_Int i1 = 0;						\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 sum *= xp[i1];							\
+     },sum);								\
+     hypre_fence();							\
+     sum *=sum_tmp;								\
+}
+
+
+#define hypre_LoopBegin(size,idx)					\
+{    														\
+    Kokkos::parallel_for(size, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {
+
+#define hypre_LoopEnd()							\
+    });									\
+    hypre_fence();							\
+}
+  
+#define hypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+    {									\
+       hypre__tot *= loop_size[d];					\
+    }									\
+    Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {									\
+        hypre_newBoxLoopDeclare()					\
+        HYPRE_Int i1 = 0;						\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+		
+#define hypre_BoxBoundaryCopyEnd()				\
+	});							\
+	hypre_fence();						\
+}
+
+#define hypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    HYPRE_Int hypre__tot = 1;											\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];					\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    for (HYPRE_Int d = 0;d < ndim;d ++)					\
+      {									\
+	hypre__tot *= loop_size[d];					\
+      }									\
+    Kokkos::parallel_for(hypre__tot, KOKKOS_LAMBDA (HYPRE_Int idx)	\
+    {									\
+        hypre_newBoxLoopDeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	i2 += local_idx*databox2.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	i2 += local_idx*databox2.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+	i2 += local_idx*databox2.strides2;
+
+
+
+#define hypre_BoxDataExchangeEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_newBoxLoop0For()
+
+#define zypre_newBoxLoop1For(i1)
+
+#define zypre_newBoxLoop2For(i1, i2) 
+ 
+#define zypre_newBoxLoop3For(i1, i2, i3)
+
+#define zypre_newBoxLoop4For(i1, i2, i3, i4)
+ 
+#define hypre_newBoxLoopSetOneBlock() {}
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock hypre_newBoxLoopSetOneBlock
+#define hypre_BoxLoopBlock()       0
+#define hypre_BoxLoop0Begin      hypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        hypre_newBoxLoop0For
+#define hypre_BoxLoop0End        hypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      hypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        hypre_newBoxLoop1For
+#define hypre_BoxLoop1End        hypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      hypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        hypre_newBoxLoop2For
+#define hypre_BoxLoop2End        hypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      hypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        hypre_newBoxLoop3For
+#define hypre_BoxLoop3End        hypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      hypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        hypre_newBoxLoop4For
+#define hypre_BoxLoop4End        hypre_newBoxLoop4End
+
+//#define hypre_newBoxLoop1ReductionBegin hypre_newBoxLoop1ReductionBegin
+//#define hypre_newBoxLoop1ReductionEnd   hypre_newBoxLoop1ReductionEnd
+//#define hypre_newBoxLoop2ReductionBegin hypre_newBoxLoop2ReductionBegin
+//#define hypre_newBoxLoop2ReductionEnd   hypre_newBoxLoop2ReductionEnd
+//#define hypre_newBoxLoop1ReductionMult hypre_newBoxLoop1ReductionMult
+//#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin
+//#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd
+//#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin
+//#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd
+
+#endif
diff --git a/src/struct_mv/boxloop_raja.h b/src/struct_mv/boxloop_raja.h
new file mode 100644
index 0000000..80297ef
--- /dev/null
+++ b/src/struct_mv/boxloop_raja.h
@@ -0,0 +1,845 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/******************************************************************************
+ *
+ * Header info for the BoxLoop
+ *
+ *****************************************************************************/
+
+/*--------------------------------------------------------------------------
+ * BoxLoop macros:
+ *--------------------------------------------------------------------------*/
+
+#ifndef HYPRE_NEWBOXLOOP_HEADER
+#define HYPRE_NEWBOXLOOP_HEADER
+
+extern "C++" {
+#include <RAJA/RAJA.hxx>
+}
+using namespace RAJA;
+
+typedef struct hypre_Boxloop_struct
+{
+	HYPRE_Int lsize0,lsize1,lsize2;
+	HYPRE_Int strides0,strides1,strides2;
+	HYPRE_Int bstart0,bstart1,bstart2;
+	HYPRE_Int bsize0,bsize1,bsize2;
+} hypre_Boxloop;
+
+#define BLOCKSIZE 256
+
+#if defined(HYPRE_MEMORY_GPU)
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#define AxCheckError(err) CheckError(err, __FUNCTION__, __LINE__)
+inline void CheckError(cudaError_t const err, char const* const fun, const HYPRE_Int line)
+{
+    if (err)
+    {
+        printf("CUDA Error Code[%d]: %s\n%s() Line:%d\n", err, cudaGetErrorString(err), fun, line);
+        /* deliberately dereference NULL so the run aborts after a CUDA error */
+        HYPRE_Int *p = NULL; *p = 1;
+    }
+}
+
+#define hypre_exec_policy cuda_exec<BLOCKSIZE>
+#define hypre_reduce_policy  cuda_reduce_atomic<BLOCKSIZE>
+#define hypre_fence() \
+cudaError err = cudaGetLastError();\
+if ( cudaSuccess != err ) {\
+printf("\n ERROR zypre_newBoxLoop: %s in %s(%d) function %s\n",cudaGetErrorString(err),__FILE__,__LINE__,__FUNCTION__); \
+}\
+AxCheckError(cudaDeviceSynchronize());
+
+#elif defined(HYPRE_USE_OPENMP)
+   #define hypre_exec_policy      omp_for_exec
+   #define hypre_reduce_policy omp_reduce
+   #define hypre_fence() 
+#elif defined(HYPRE_USING_OPENMP_ACC)
+   #define hypre_exec_policy   omp_parallel_for_acc
+   #define hypre_reduce_policy omp_acc_reduce
+   #define hypre_fence()
+#else 
+   #define hypre_exec_policy   seq_exec
+   #define hypre_reduce_policy seq_reduce
+   #define hypre_fence()
+#endif
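+
+/* Usage sketch (illustrative only): with the policies selected above, a
+   raw RAJA loop over n entries has the following shape (ap and n are
+   assumed names for this example):
+
+      forall< hypre_exec_policy >(0, n, [=] RAJA_DEVICE (HYPRE_Int i)
+      {
+         ap[i] = 0.0;
+      });
+      hypre_fence();
+*/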
+
+#define zypre_BoxLoopIncK(k,box,i)					\
+{									\
+   HYPRE_Int idx = idx_local;						\
+   local_idx  = idx % box.lsize0;					\
+   idx        = idx / box.lsize0;					\
+   i += (local_idx*box.strides0 + box.bstart0) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize0 + 1);			\
+   local_idx  = idx % box.lsize1;					\
+   idx        = idx / box.lsize1;					\
+   i += (local_idx*box.strides1 + box.bstart1) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize1 + 1);			\
+   local_idx  = idx % box.lsize2;					\
+   idx  = idx / box.lsize2;					\
+   i += (local_idx*box.strides2 + box.bstart2) * hypre_boxD##k;		\
+   hypre_boxD##k *= hypre_max(0, box.bsize2 + 1);			\
+}
+
+
+#define zypre_BoxLoopCUDAInit(ndim,loop_size)				\
+  HYPRE_Int hypre__tot = 1;						\
+  for (HYPRE_Int i = 0;i < ndim;i ++)					\
+      hypre__tot *= loop_size[i];
+
+
+#define zypre_BoxLoopCUDADeclare()										\
+	HYPRE_Int local_idx;												\
+	HYPRE_Int idx_local = idx;
+
+#define zypre_newBoxLoop0Begin(ndim, loop_size)			\
+{									\
+   zypre_BoxLoopCUDAInit(ndim,loop_size);					\
+   forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+   {
+
+
+#define zypre_newBoxLoop0End()					\
+	});											\
+	hypre_fence();      \
+}
+
+#define zypre_BoxLoopDataDeclareK(k,ndim,loop_size,dbox,start,stride)	\
+	hypre_Boxloop databox##k;					\
+	databox##k.lsize0 = loop_size[0];				\
+	databox##k.strides0 = stride[0];				\
+	databox##k.bstart0  = start[0] - dbox->imin[0];			\
+	databox##k.bsize0   = dbox->imax[0]-dbox->imin[0];		\
+	if (ndim > 1)							\
+	{								\
+	    databox##k.lsize1 = loop_size[1];				\
+	    databox##k.strides1 = stride[1];				\
+	    databox##k.bstart1  = start[1] - dbox->imin[1];		\
+	    databox##k.bsize1   = dbox->imax[1]-dbox->imin[1];   	\
+	}								\
+	else						        	\
+	{							       	\
+		databox##k.lsize1 = 1;				       	\
+		databox##k.strides1 = 0;		       		\
+		databox##k.bstart1  = 0;	       			\
+		databox##k.bsize1   = 0;		       		\
+	}								\
+	if (ndim == 3)							\
+	{								\
+	    databox##k.lsize2 = loop_size[2];				\
+	    databox##k.strides2 = stride[2];				\
+	    databox##k.bstart2  = start[2] - dbox->imin[2];		\
+	    databox##k.bsize2   = dbox->imax[2]-dbox->imin[2];   	\
+	}								\
+	else								\
+	{								\
+	    databox##k.lsize2 = 1;					\
+	    databox##k.strides2 = 0;					\
+	    databox##k.bstart2  = 0;					\
+	    databox##k.bsize2   = 0;					\
+	}
+
+#define zypre_newBoxLoop1Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1)		\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+    zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+      zypre_BoxLoopCUDADeclare();					\
+      HYPRE_Int hypre_boxD1 = 1;					\
+      HYPRE_Int i1 = 0;							\
+      zypre_BoxLoopIncK(1,databox1,i1);
+
+      
+#define zypre_newBoxLoop1End(i1)				\
+	});											\
+    hypre_fence();\
+}
+	
+#define zypre_newBoxLoop2Begin(ndim, loop_size,				\
+                                dbox1, start1, stride1, i1,	\
+                                dbox2, start2, stride2, i2)	\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+    zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1);	\
+    zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2);	\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+        HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1;			\
+		HYPRE_Int i1 = 0, i2 = 0;							\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	
+
+
+#define zypre_newBoxLoop2End(i1, i2)			\
+	});											\
+    hypre_fence();\
+}
+
+#define zypre_newBoxLoop3Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3)		\
+  {									\
+  zypre_BoxLoopCUDAInit(ndim,loop_size);						\
+        zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+        zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+        zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+        forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+	{								\
+	  zypre_BoxLoopCUDADeclare();					\
+	  HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1; \
+	  HYPRE_Int i1 = 0, i2 = 0, i3 = 0;				\
+	  local_idx  = idx_local % databox1.lsize0;				\
+	  idx_local  = idx_local / databox1.lsize0;				\
+	  i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);			\
+	  i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);			\
+	  i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);			\
+	  local_idx  = idx_local % databox1.lsize1;				\
+	  idx_local  = idx_local / databox1.lsize1;				\
+	  i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);			\
+	  i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);			\
+	  i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);			\
+	  local_idx  = idx_local % databox1.lsize2;				\
+	  idx_local  = idx_local / databox1.lsize2;				\
+	  i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1;	\
+	  hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);			\
+	  i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2;	\
+	  hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);			\
+	  i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3;	\
+	  hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);			\
+	  
+
+#define zypre_newBoxLoop3End(i1, i2, i3)			\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_newBoxLoop4Begin(ndim, loop_size,				\
+			       dbox1, start1, stride1, i1,		\
+			       dbox2, start2, stride2, i2,		\
+			       dbox3, start3, stride3, i3,		\
+			       dbox4, start4, stride4, i4)		\
+{								       \
+ zypre_BoxLoopCUDAInit(ndim,loop_size);					       \
+     zypre_BoxLoopDataDeclareK(1,ndim,loop_size,dbox1,start1,stride1); \
+     zypre_BoxLoopDataDeclareK(2,ndim,loop_size,dbox2,start2,stride2); \
+     zypre_BoxLoopDataDeclareK(3,ndim,loop_size,dbox3,start3,stride3); \
+     zypre_BoxLoopDataDeclareK(4,ndim,loop_size,dbox4,start4,stride4); \
+     forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+     {									\
+         zypre_BoxLoopCUDADeclare();					\
+		 HYPRE_Int hypre_boxD1 = 1,hypre_boxD2 = 1,hypre_boxD3 = 1,hypre_boxD4 = 1; \
+	 HYPRE_Int i1 = 0, i2 = 0, i3 = 0,i4 = 0;			\
+	 local_idx  = idx_local % databox1.lsize0;			\
+	 idx_local  = idx_local / databox1.lsize0;			\
+	 i1 += (local_idx*databox1.strides0 + databox1.bstart0) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize0 + 1);		\
+	 i2 += (local_idx*databox2.strides0 + databox2.bstart0) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize0 + 1);		\
+	 i3 += (local_idx*databox3.strides0 + databox3.bstart0) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize0 + 1);		\
+	 i4 += (local_idx*databox4.strides0 + databox4.bstart0) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize0 + 1);		\
+	 local_idx  = idx_local % databox1.lsize1;			\
+	 idx_local  = idx_local / databox1.lsize1;			\
+	 i1 += (local_idx*databox1.strides1 + databox1.bstart1) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize1 + 1);		\
+	 i2 += (local_idx*databox2.strides1 + databox2.bstart1) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize1 + 1);		\
+	 i3 += (local_idx*databox3.strides1 + databox3.bstart1) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize1 + 1);		\
+	 i4 += (local_idx*databox4.strides1 + databox4.bstart1) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize1 + 1);		\
+	 local_idx  = idx_local % databox1.lsize2;			\
+	 idx_local  = idx_local / databox1.lsize2;			\
+	 i1 += (local_idx*databox1.strides2 + databox1.bstart2) * hypre_boxD1; \
+	 hypre_boxD1 *= hypre_max(0, databox1.bsize2 + 1);		\
+	 i2 += (local_idx*databox2.strides2 + databox2.bstart2) * hypre_boxD2; \
+	 hypre_boxD2 *= hypre_max(0, databox2.bsize2 + 1);		\
+	 i3 += (local_idx*databox3.strides2 + databox3.bstart2) * hypre_boxD3; \
+	 hypre_boxD3 *= hypre_max(0, databox3.bsize2 + 1);		\
+	 i4 += (local_idx*databox4.strides2 + databox4.bstart2) * hypre_boxD4; \
+	 hypre_boxD4 *= hypre_max(0, databox4.bsize2 + 1);		\
+	 
+#define zypre_newBoxLoop4End(i1, i2, i3, i4)	\
+  });						\
+  hypre_fence();				\
+}
+
+#define MAX_BLOCK BLOCKSIZE
+
+extern "C++" {
+#if defined(HYPRE_MEMORY_GPU)
+template<class T>
+class ReduceMult   
+{
+public:
+  /*!
+   * \brief Constructor takes initial reduction value (default constructor
+   * is disabled).
+   *
+   * Note: Constructor only executes on the host.
+   */
+  explicit ReduceMult(T init_val)
+  {
+    m_is_copy_host = false;
+    m_myID = getCudaReductionId();
+    getCudaReductionTallyBlock(m_myID,
+                               (void **)&m_tally_host,
+                               (void **)&m_tally_device);
+    m_tally_host->tally = init_val;
+  }
+
+  /*!
+   * \brief Initialize shared memory on device, request shared memory on host.
+   *
+   * Copy constructor executes on both host and device.
+   * On host requests dynamic shared memory and gets offset into dynamic
+   * shared memory if in forall.
+   * On device initializes dynamic shared memory to appropriate value.
+   */
+  RAJA_HOST_DEVICE
+  ReduceMult(const ReduceMult<T> &other)
+  {
+    *this = other;
+#if defined(__CUDA_ARCH__)
+    m_is_copy_device = true;
+    m_finish_reduction = !other.m_is_copy_device;
+    extern __shared__ unsigned char sd_block[];
+    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+    HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                   + (blockDim.x * blockDim.y) * threadIdx.z;
+
+    // initialize shared memory
+    T val = static_cast<T>(0);
+    for (HYPRE_Int i = BLOCKSIZE / 2; i > 0; i /= 2) {
+      // this descends all the way to 1
+      if (threadId < i) {
+        sd[threadId + i] = val;
+      }
+    }
+    if (threadId < 1) {
+      sd[threadId] = val;
+    }
+
+    __syncthreads();
+#else
+    m_is_copy_host = true;
+    m_smem_offset = getCudaSharedmemOffset(m_myID, BLOCKSIZE, sizeof(T));
+#endif
+  }
+
+  /*!
+   * \brief Finish reduction on device and free memory on host.
+   *
+   * Destruction on host releases the device memory chunk for
+   * reduction id and id itself for others to use.
+   * Destruction on device completes the reduction.
+   *
+   * Note: destructor executes on both host and device.
+   */
+  RAJA_HOST_DEVICE ~ReduceMult<T>()
+  {
+#if defined(__CUDA_ARCH__)
+    if (m_finish_reduction) {
+      extern __shared__ unsigned char sd_block[];
+      T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+      HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                     + (blockDim.x * blockDim.y) * threadIdx.z;
+
+      T temp = 1;
+      __syncthreads();
+
+      for (HYPRE_Int i = BLOCKSIZE / 2; i >= WARP_SIZE; i /= 2) {
+        if (threadId < i) {
+          sd[threadId] *= sd[threadId + i];
+        }
+        __syncthreads();
+      }
+
+      if (threadId < WARP_SIZE) {
+        temp = sd[threadId];
+        for (HYPRE_Int i = WARP_SIZE / 2; i > 0; i /= 2) {
+          temp *= HIDDEN::shfl_xor<T>(temp, i);
+        }
+      }
+
+      // one thread adds to tally
+      if (threadId == 0) {
+        _atomicAdd<T>(&(m_tally_device->tally), temp);
+      }
+    }
+#else
+    if (!m_is_copy_host) {
+      releaseCudaReductionTallyBlock(m_myID);
+      releaseCudaReductionId(m_myID);
+    }
+#endif
+
+    
+  }
+
+  /*!
+   * \brief Operator that returns the reduced product value.
+   *
+   * Note: accessor only executes on host.
+   */
+  operator T()
+  {
+    beforeCudaReadTallyBlock<true>(m_myID);
+    return m_tally_host->tally;
+  }
+
+  /*!
+   * \brief Method that returns the reduced product value.
+   *
+   * Note: accessor only executes on host.
+   */
+  T get() { return operator T(); }
+
+  /*!
+   * \brief Operator that multiplies a value into this thread's partial product.
+   *
+   * Note: only operates on device.
+   */
+  RAJA_DEVICE ReduceMult<T> const &
+  operator*=(T val) const
+  {
+    extern __shared__ unsigned char sd_block[];
+    T *sd = reinterpret_cast<T *>(&sd_block[m_smem_offset]);
+
+    HYPRE_Int threadId = threadIdx.x + blockDim.x * threadIdx.y
+                   + (blockDim.x * blockDim.y) * threadIdx.z;
+
+    sd[threadId] *= val;
+
+    return *this;
+  }
+
+private:
+  /*!
+   * \brief Default constructor is declared private and not implemented.
+   */
+  ReduceMult<T>();
+
+  /*!
+   * \brief Pointer to host tally block cache slot for this reduction variable.
+   */
+  CudaReductionTallyTypeAtomic<T> *m_tally_host = nullptr;
+
+  /*!
+   * \brief Pointer to device tally block slot for this reduction variable.
+   */
+  CudaReductionTallyTypeAtomic<T> *m_tally_device = nullptr;
+
+  /*!
+   * \brief My cuda reduction variable ID.
+   */
+  HYPRE_Int m_myID = -1;
+
+  /*!
+   * \brief Byte offset into dynamic shared memory.
+   */
+  HYPRE_Int m_smem_offset = -1;
+
+  /*!
+   * \brief If this variable is a copy or not; only original may release memory 
+   *        or perform finalization.
+   */
+  bool m_is_copy_host = false;
+  bool m_is_copy_device = false;
+  bool m_finish_reduction = false;
+
+  // Sanity checks for block size and template type size
+  static constexpr bool powerOfTwoCheck = (!(BLOCKSIZE & (BLOCKSIZE - 1)));
+  static constexpr bool reasonableRangeCheck =
+      ((BLOCKSIZE >= 32) && (BLOCKSIZE <= 1024));
+  static constexpr bool sizeofcheck =
+      ((sizeof(T) <= sizeof(CudaReductionDummyDataType))
+       && (sizeof(CudaReductionTallyType<T>)
+           <= sizeof(CudaReductionDummyTallyType))
+       && (sizeof(CudaReductionBlockType<T>)
+           <= sizeof(CudaReductionDummyBlockType)));
+  static_assert(powerOfTwoCheck, "Error: block sizes must be a power of 2");
+  static_assert(reasonableRangeCheck,
+                "Error: block sizes must be between 32 and 1024");
+  static_assert(sizeofcheck,
+      "Error: type must be of size <= " 
+      RAJA_STRINGIFY_MACRO(RAJA_CUDA_REDUCE_VAR_MAXSIZE));
+};
+#elif defined(HYPRE_USE_OPENMP)
+    template <typename T>
+    class ReduceMult
+    {
+        using my_type = ReduceMult;
+        
+    public:
+        //
+        // Constructor takes default value (default ctor is disabled).
+        //
+        explicit ReduceMult(T init_val, T initializer = 1)
+        : m_parent(NULL), m_val(init_val), m_custom_init(initializer)
+        {
+        }
+        
+        //
+        // Copy ctor.
+        //
+        ReduceMult(const ReduceMult& other) :
+        m_parent(other.m_parent ? other.m_parent : &other),
+        m_val(other.m_custom_init),
+        m_custom_init(other.m_custom_init)
+        {
+        }
+        
+        //
+        // Destruction folds this copy's partial product into the parent
+        // reduction, inside an OpenMP critical section.
+        //
+        ~ReduceMult()
+        {
+            if (m_parent) {
+#pragma omp critical
+                {
+                    *m_parent *= m_val;
+                }
+            }
+        }
+        
+        //
+        // Operator that returns the reduced product value.
+        //
+        operator T()
+        {
+            return m_val;
+        }
+        
+        //
+        // Method that returns the reduced product value.
+        //
+        T get() { return operator T(); }
+        
+        //
+        // *= operator that multiplies a value into the current thread's partial product.
+        //
+        const ReduceMult& operator*=(T rhs) const
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+        ReduceMult& operator*=(T rhs)
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+    private:
+        //
+        // Default ctor is declared private and not implemented.
+        //
+        ReduceMult();
+        
+        const my_type * m_parent;
+        
+        mutable T m_val;
+        T m_custom_init;
+        
+    };
+#else
+    template <typename T>
+    class ReduceMult
+    {
+        using my_type = ReduceMult;
+        
+    public:
+        //
+        // Constructor takes default value (default ctor is disabled).
+        //
+        explicit ReduceMult(T init_m_val, T initializer = 1) :
+        m_parent(NULL),
+        m_val(init_m_val),
+        m_custom_init(initializer)
+        {
+        }
+        
+        //
+        // Copy ctor.
+        //
+        ReduceMult(const ReduceMult& other) :
+        m_parent(other.m_parent ? other.m_parent : &other),
+        m_val(other.m_custom_init),
+        m_custom_init(other.m_custom_init)
+        {
+        }
+        
+        //
+        // Destruction folds this copy's partial product into the parent
+        // reduction.
+        //
+        ~ReduceMult()
+        {
+            if (m_parent) {
+                *m_parent *= m_val;
+            }
+        }
+        
+        //
+        // Operator that returns the reduced product value.
+        //
+        operator T()
+        {
+            return m_val;
+        }
+        
+        //
+        // Method that returns the reduced product value.
+        //
+        T get() { return operator T(); }
+        
+        //
+        // *= operator that multiplies a value into the reduction.
+        //
+        ReduceMult& operator*=(T rhs)
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+        const ReduceMult& operator*=(T rhs) const
+        {
+            this->m_val *= rhs;
+            return *this;
+        }
+        
+    private:
+        //
+        // Default ctor is declared private and not implemented.
+        //
+        ReduceMult();
+        
+        const my_type * m_parent;
+        
+        mutable T m_val;
+        T m_custom_init;
+    };
+#endif
+}
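+
+/* Usage sketch (illustrative only): whichever ReduceMult variant is
+   compiled in, it accumulates a product across a forall, and converting
+   it back to HYPRE_Real on the host finishes the reduction (xp and n are
+   assumed names for this example):
+
+      ReduceMult<HYPRE_Real> prod(1.0);
+      forall< hypre_exec_policy >(0, n, [=] RAJA_DEVICE (HYPRE_Int i)
+      {
+         prod *= xp[i];
+      });
+      HYPRE_Real result = (HYPRE_Real) prod;
+*/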
+
+
+#define zypre_newBoxLoop1ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,sum) \
+{									\
+   HYPRE_Real sum_tmp;							\
+   {									\
+      ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0);				\
+      zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1,i1)	\
+      {
+
+#define zypre_newBoxLoop1ReductionEnd(i1,sum)				\
+      }									\
+      zypre_newBoxLoop1End(i1);					\
+      hypre_fence();						\
+      sum_tmp = (HYPRE_Real)(sum);				\
+   }								\
+   sum += sum_tmp; \
+}
+		    
+#define zypre_newBoxLoop2ReductionBegin(ndim, loop_size,		\
+					dbox1, start1, stride1, i1,	\
+					dbox2, start2, stride2, i2,sum)	\
+{									\
+   HYPRE_Real sum_tmp;							\
+   {									\
+      ReduceSum< hypre_reduce_policy, HYPRE_Real> sum(0.0);				\
+      zypre_newBoxLoop2Begin(ndim, loop_size, \
+			     dbox1, start1, stride1,i1,\
+			     dbox2, start2, stride2,i2)	\
+      {
+
+#define zypre_newBoxLoop2ReductionEnd(i1,i2,sum)			\
+      }									\
+      zypre_newBoxLoop2End(i1,i2);					\
+      hypre_fence();							\
+      sum_tmp = (HYPRE_Real)(sum);					\
+   }								\
+   sum += sum_tmp; \
+}
+
+#define zypre_newBoxLoop1ReductionMult(ndim, loop_size,				\
+				       dbox1, start1, stride1, i1,xp,sum) \
+{									\
+   ReduceMult<HYPRE_Real> local_result_raja(1.0);				\
+   zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \
+   {									\
+       local_result_raja *= xp[i1];					\
+   }									\
+   zypre_newBoxLoop1End(i1)						\
+   hypre_fence();							\
+   sum *= (HYPRE_Real)(local_result_raja);				\
+}
+
+
+#define hypre_LoopBegin(size,idx)					\
+{									\
+   forall< hypre_exec_policy >(0, size, [=] RAJA_DEVICE (HYPRE_Int idx)	\
+   {
+
+#define hypre_LoopEnd()					\
+   });							\
+   hypre_fence();		\
+}
+  
+#define zypre_BoxBoundaryCopyBegin(ndim, loop_size, stride1, i1, idx) 	\
+{									\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);				\
+    hypre_Boxloop databox1;						\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+        HYPRE_Int i1 = 0;							\
+        local_idx  = idx_local % databox1.lsize0;			\
+        idx_local  = idx_local / databox1.lsize0;			\
+        i1 += local_idx*databox1.strides0;				\
+        local_idx  = idx_local % databox1.lsize1;			\
+        idx_local  = idx_local / databox1.lsize1;			\
+        i1 += local_idx*databox1.strides1;				\
+        local_idx  = idx_local % databox1.lsize2;			\
+        idx_local  = idx_local / databox1.lsize2;			\
+        i1 += local_idx*databox1.strides2;				\
+		
+#define zypre_BoxBoundaryCopyEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_BoxDataExchangeBegin(ndim, loop_size,				\
+                                   stride1, i1,	\
+                                   stride2, i2)	\
+{    														\
+    zypre_BoxLoopCUDAInit(ndim,loop_size);					\
+    hypre_Boxloop databox1,databox2;					\
+    databox1.lsize0 = loop_size[0];					\
+    databox1.lsize1 = loop_size[1];					\
+    databox1.lsize2 = loop_size[2];					\
+    databox1.strides0 = stride1[0];					\
+    databox1.strides1 = stride1[1];					\
+    databox1.strides2 = stride1[2];					\
+    databox2.lsize0 = loop_size[0];					\
+    databox2.lsize1 = loop_size[1];					\
+    databox2.lsize2 = loop_size[2];					\
+    databox2.strides0 = stride2[0];					\
+    databox2.strides1 = stride2[1];					\
+    databox2.strides2 = stride2[2];					\
+    forall< hypre_exec_policy >(0, hypre__tot, [=] RAJA_DEVICE (HYPRE_Int idx) \
+    {									\
+        zypre_BoxLoopCUDADeclare()					\
+	HYPRE_Int i1 = 0, i2 = 0;					\
+	local_idx  = idx_local % databox1.lsize0;			\
+	idx_local  = idx_local / databox1.lsize0;			\
+	i1 += local_idx*databox1.strides0;				\
+	i2 += local_idx*databox2.strides0;				\
+	local_idx  = idx_local % databox1.lsize1;			\
+	idx_local  = idx_local / databox1.lsize1;			\
+	i1 += local_idx*databox1.strides1;				\
+	i2 += local_idx*databox2.strides1;				\
+	local_idx  = idx_local % databox1.lsize2;			\
+	idx_local  = idx_local / databox1.lsize2;			\
+	i1 += local_idx*databox1.strides2;				\
+	i2 += local_idx*databox2.strides2;
+
+
+
+#define zypre_BoxDataExchangeEnd()				\
+	});											\
+	hypre_fence();							\
+}
+
+#define zypre_newBoxLoop0For()
+
+#define zypre_newBoxLoop1For(i1)
+
+#define zypre_newBoxLoop2For(i1, i2) 
+ 
+#define zypre_newBoxLoop3For(i1, i2, i3)
+
+#define zypre_newBoxLoop4For(i1, i2, i3, i4)
+
+#define zypre_newBoxLoopSetOneBlock()
+
+#define hypre_newBoxLoopGetIndex(index)					\
+  index[0] = hypre__i; index[1] = hypre__j; index[2] = hypre__k
+
+#define hypre_BoxLoopGetIndex    zypre_BoxLoopGetIndex
+#define hypre_BoxLoopSetOneBlock zypre_newBoxLoopSetOneBlock
+#define hypre_BoxLoopBlock()       0
+#define hypre_BoxLoop0Begin      zypre_newBoxLoop0Begin
+#define hypre_BoxLoop0For        zypre_newBoxLoop0For
+#define hypre_BoxLoop0End        zypre_newBoxLoop0End
+#define hypre_BoxLoop1Begin      zypre_newBoxLoop1Begin
+#define hypre_BoxLoop1For        zypre_newBoxLoop1For
+#define hypre_BoxLoop1End        zypre_newBoxLoop1End
+#define hypre_BoxLoop2Begin      zypre_newBoxLoop2Begin
+#define hypre_BoxLoop2For        zypre_newBoxLoop2For
+#define hypre_BoxLoop2End        zypre_newBoxLoop2End
+#define hypre_BoxLoop3Begin      zypre_newBoxLoop3Begin
+#define hypre_BoxLoop3For        zypre_newBoxLoop3For
+#define hypre_BoxLoop3End        zypre_newBoxLoop3End
+#define hypre_BoxLoop4Begin      zypre_newBoxLoop4Begin
+#define hypre_BoxLoop4For        zypre_newBoxLoop4For
+#define hypre_BoxLoop4End        zypre_newBoxLoop4End
+
+#define hypre_newBoxLoop1ReductionBegin zypre_newBoxLoop1ReductionBegin
+#define hypre_newBoxLoop1ReductionEnd   zypre_newBoxLoop1ReductionEnd
+#define hypre_newBoxLoop2ReductionBegin zypre_newBoxLoop2ReductionBegin
+#define hypre_newBoxLoop2ReductionEnd   zypre_newBoxLoop2ReductionEnd
+#define hypre_newBoxLoop1ReductionMult zypre_newBoxLoop1ReductionMult
+#define hypre_BoxBoundaryCopyBegin zypre_BoxBoundaryCopyBegin
+#define hypre_BoxBoundaryCopyEnd zypre_BoxBoundaryCopyEnd
+#define hypre_BoxDataExchangeBegin zypre_BoxDataExchangeBegin
+#define hypre_BoxDataExchangeEnd zypre_BoxDataExchangeEnd
+#endif
diff --git a/src/struct_mv/communication_info.c b/src/struct_mv/communication_info.c
index 6ad62e5..3b939e5 100644
--- a/src/struct_mv/communication_info.c
+++ b/src/struct_mv/communication_info.c
@@ -431,14 +431,13 @@ hypre_CreateCommInfoFromStencil( hypre_StructGrid      *grid,
       hypre_BoxSetExtents(sbox, istart, istop);
       start = hypre_BoxIMin(sbox);
       hypre_BoxGetSize(sbox, loop_size);
-      hypre_BoxLoop1Begin(ndim, loop_size,
+
+      hypre_SerialBoxLoop1Begin(ndim, loop_size,
                           stencil_box, start, stride, si);
-      hypre_BoxLoopSetOneBlock();
-      hypre_BoxLoop1For(si)
       {
          stencil_grid[si] = 1;
       }
-      hypre_BoxLoop1End(si);
+      hypre_SerialBoxLoop1End(si);
    }
 
    /*------------------------------------------------------
@@ -816,9 +815,7 @@ hypre_CreateCommInfoFromNumGhost( hypre_StructGrid      *grid,
    size = 0;
    start = hypre_BoxIMin(box);
    hypre_BoxGetSize(box, loop_size);
-   hypre_BoxLoop0Begin(ndim, loop_size);
-   hypre_BoxLoopSetOneBlock();
-   hypre_BoxLoop0For()
+   hypre_SerialBoxLoop0Begin(ndim, loop_size);
    {
       hypre_BoxLoopGetIndex(ii);
       for (d = 0; d < ndim; d++)
@@ -835,7 +832,7 @@ hypre_CreateCommInfoFromNumGhost( hypre_StructGrid      *grid,
       }
       size++;
    }
-   hypre_BoxLoop0End();
+   hypre_SerialBoxLoop0End();
 
    hypre_BoxDestroy(box);
 
diff --git a/src/struct_mv/headers b/src/struct_mv/headers
index 157f067..ecc85d5 100755
--- a/src/struct_mv/headers
+++ b/src/struct_mv/headers
@@ -19,6 +19,9 @@ INTERNAL_HEADER=_hypre_struct_mv.h
 
 cat > $INTERNAL_HEADER <<@
 
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
+
 #ifndef hypre_STRUCT_MV_HEADER
 #define hypre_STRUCT_MV_HEADER
 
@@ -29,25 +32,56 @@ cat > $INTERNAL_HEADER <<@
 #include "HYPRE_struct_mv.h"
 #include "_hypre_utilities.h"
 
+@
+
+
+cat >> $INTERNAL_HEADER <<@
+#if defined(HYPRE_USE_RAJA)
+@
+
+cat boxloop_raja.h                >> $INTERNAL_HEADER
+
+cat >> $INTERNAL_HEADER <<@
+#elif defined(HYPRE_USE_KOKKOS)
+@
+
+cat boxloop_kokkos.h              >> $INTERNAL_HEADER
+
+cat >> $INTERNAL_HEADER <<@
+#elif defined(HYPRE_USE_CUDA)
+@
+
+cat boxloop_cuda.h                >> $INTERNAL_HEADER
+
+cat >> $INTERNAL_HEADER <<@
+#else
+@
+
+cat boxloop.h                     >> $INTERNAL_HEADER
+
+cat >> $INTERNAL_HEADER <<@
+#endif
+@
+
+cat >> $INTERNAL_HEADER <<@
 #ifdef __cplusplus
 extern "C" {
 #endif
-
 @
 
 #===========================================================================
 # Structures and prototypes
 #===========================================================================
 
-cat box.h                  >> $INTERNAL_HEADER	
+cat box.h                  >> $INTERNAL_HEADER  
 cat assumed_part.h         >> $INTERNAL_HEADER
 cat box_manager.h          >> $INTERNAL_HEADER
-cat struct_grid.h          >> $INTERNAL_HEADER	
-cat struct_stencil.h       >> $INTERNAL_HEADER	
-cat struct_communication.h >> $INTERNAL_HEADER	
-cat computation.h          >> $INTERNAL_HEADER	
-cat struct_matrix.h        >> $INTERNAL_HEADER	
-cat struct_vector.h        >> $INTERNAL_HEADER	
+cat struct_grid.h          >> $INTERNAL_HEADER  
+cat struct_stencil.h       >> $INTERNAL_HEADER  
+cat struct_communication.h >> $INTERNAL_HEADER  
+cat computation.h          >> $INTERNAL_HEADER  
+cat struct_matrix.h        >> $INTERNAL_HEADER  
+cat struct_vector.h        >> $INTERNAL_HEADER  
 
 cat protos.h               >> $INTERNAL_HEADER
 
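
With these edits the 'headers' script splices exactly one box-loop
backend into the generated header.  The skeleton of the resulting
_hypre_struct_mv.h then looks roughly like this (illustrative only;
the real bodies come from the cat'ed boxloop_*.h files):

   /*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
   #ifndef hypre_STRUCT_MV_HEADER
   #define hypre_STRUCT_MV_HEADER

   #if defined(HYPRE_USE_RAJA)
   /* contents of boxloop_raja.h */
   #elif defined(HYPRE_USE_KOKKOS)
   /* contents of boxloop_kokkos.h */
   #elif defined(HYPRE_USE_CUDA)
   /* contents of boxloop_cuda.h */
   #else
   /* contents of boxloop.h (plain host/OpenMP case) */
   #endif

   /* box.h, struct_grid.h, ..., protos.h follow here */
   #endif
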
diff --git a/src/struct_mv/protos.h b/src/struct_mv/protos.h
index e78c3b6..6a78373 100644
--- a/src/struct_mv/protos.h
+++ b/src/struct_mv/protos.h
@@ -304,4 +304,4 @@ HYPRE_Int hypre_StructVectorMigrate ( hypre_CommPkg *comm_pkg , hypre_StructVect
 HYPRE_Int hypre_StructVectorPrint ( const char *filename , hypre_StructVector *vector , HYPRE_Int all );
 hypre_StructVector *hypre_StructVectorRead ( MPI_Comm comm , const char *filename , HYPRE_Int *num_ghost );
 HYPRE_Int hypre_StructVectorMaxValue ( hypre_StructVector *vector , HYPRE_Real *max_value , HYPRE_Int *max_index , hypre_Index max_xyz_index );
-
+hypre_StructVector *hypre_StructVectorClone ( hypre_StructVector *vector );
diff --git a/src/struct_mv/struct_axpy.c b/src/struct_mv/struct_axpy.c
index 00ca6a4..a1c2af6 100644
--- a/src/struct_mv/struct_axpy.c
+++ b/src/struct_mv/struct_axpy.c
@@ -29,9 +29,6 @@ hypre_StructAxpy( HYPRE_Complex       alpha,
 {
    hypre_Box        *x_data_box;
    hypre_Box        *y_data_box;
-                 
-   HYPRE_Int         xi;
-   HYPRE_Int         yi;
                     
    HYPRE_Complex    *xp;
    HYPRE_Complex    *yp;
@@ -59,12 +56,17 @@ hypre_StructAxpy( HYPRE_Complex       alpha,
       yp = hypre_StructVectorBoxData(y, i);
 
       hypre_BoxGetSize(box, loop_size);
-
+	  
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
+#endif
+#define HYPRE_BOX_PRIVATE_VAR xi,yi
+	  
       hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size,
-                          x_data_box, start, unit_stride, xi,
-                          y_data_box, start, unit_stride, yi);
+			  x_data_box, start, unit_stride, xi,
+			  y_data_box, start, unit_stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(xi, yi)
       {
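
This hunk is representative of a change repeated in most files below
(struct_copy.c, struct_matvec.c, struct_scale.c, and others): loop
indices such as xi and yi are dropped from function scope and from the
explicit OpenMP private lists, since the Begin macros now evidently
declare them internally, leaving only HYPRE_BOX_PRIVATE in the pragma.
Where a loop still needs to publish its index names, as here and in
struct_innerprod.c, it does so through the redefinable
HYPRE_BOX_PRIVATE_VAR macro first.  The idiom in isolation:

   #ifdef HYPRE_BOX_PRIVATE_VAR
   #undef HYPRE_BOX_PRIVATE_VAR
   #endif
   #define HYPRE_BOX_PRIVATE_VAR xi,yi   /* this loop's private indices */

   hypre_BoxLoop2Begin(...);             /* declares xi and yi */
   #ifdef HYPRE_USING_OPENMP
   #pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
   #endif
   hypre_BoxLoop2For(xi, yi)
   {
      /* loop body */
   }
   hypre_BoxLoop2End(xi, yi);
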
diff --git a/src/struct_mv/struct_communication.c b/src/struct_mv/struct_communication.c
index 801d8a3..016b46b 100644
--- a/src/struct_mv/struct_communication.c
+++ b/src/struct_mv/struct_communication.c
@@ -19,6 +19,14 @@ char       filename[255];
 FILE      *file;
 #endif
 
+/* This is needed to do communication in the GPU case */
+#if defined(HYPRE_MEMORY_GPU)
+static HYPRE_Complex* global_recv_buffer;
+static HYPRE_Complex* global_send_buffer;
+static HYPRE_Int      global_recv_size = 0;
+static HYPRE_Int      global_send_size = 0;
+#endif
+
 /* this computes a (large enough) size (in doubles) for the message prefix */
 #define hypre_CommPrefixSize(ne)                                        \
    ( (((1+ne)*sizeof(HYPRE_Int) + ne*sizeof(hypre_Box))/sizeof(HYPRE_Complex)) + 1 )
@@ -82,7 +90,7 @@ hypre_CommPkgCreate( hypre_CommInfo   *comm_info,
    HYPRE_Int            *send_order;
 
    HYPRE_Int             i, j, k, p, m, size, p_old, my_proc;
-                        
+
    /*------------------------------------------------------
     *------------------------------------------------------*/
 
@@ -624,7 +632,7 @@ hypre_CommTypeSetEntries( hypre_CommType  *comm_type,
    hypre_Box            *box;
    hypre_Box            *data_box;
    HYPRE_Int             i, j;
-                
+
    for (j = 0; j < num_entries; j++)
    {
       i = boxnums[j];
@@ -751,10 +759,10 @@ hypre_CommTypeSetEntry( hypre_Box           *box,
    hypre_CommEntryTypeOffset(comm_entry) = offset;
    hypre_CommEntryTypeDim(comm_entry) = dim;
    hypre_CommEntryTypeOrder(comm_entry) = order;
- 
+
    return hypre_error_flag;
 }
- 
+
 /*--------------------------------------------------------------------------
  * Initialize a non-blocking communication exchange.
  *
@@ -781,19 +789,22 @@ hypre_InitializeCommunication( hypre_CommPkg     *comm_pkg,
    HYPRE_Int            num_sends  = hypre_CommPkgNumSends(comm_pkg);
    HYPRE_Int            num_recvs  = hypre_CommPkgNumRecvs(comm_pkg);
    MPI_Comm             comm       = hypre_CommPkgComm(comm_pkg);
-                     
+
    HYPRE_Int            num_requests;
    hypre_MPI_Request         *requests;
    hypre_MPI_Status          *status;
    HYPRE_Complex      **send_buffers;
    HYPRE_Complex      **recv_buffers;
 
+   HYPRE_Complex      **send_buffers_data;
+   HYPRE_Complex      **recv_buffers_data;
+
    hypre_CommType      *comm_type, *from_type, *to_type;
    hypre_CommEntryType *comm_entry;
    HYPRE_Int            num_entries;
 
    HYPRE_Int           *length_array;
-   HYPRE_Int           *stride_array;
+   HYPRE_Int           *stride_array, unitst_array[HYPRE_MAXDIM+1];
    HYPRE_Int           *order;
 
    HYPRE_Complex       *dptr, *kptr, *lptr;
@@ -801,7 +812,7 @@ hypre_InitializeCommunication( hypre_CommPkg     *comm_pkg,
 
    HYPRE_Int            i, j, d, ll;
    HYPRE_Int            size;
-                      
+
    /*--------------------------------------------------------------------
     * allocate requests and status
     *--------------------------------------------------------------------*/
@@ -828,6 +839,31 @@ hypre_InitializeCommunication( hypre_CommPkg     *comm_pkg,
       }
    }
 
+   /* Prepare send buffers */
+#if defined(HYPRE_MEMORY_GPU)
+   send_buffers_data = hypre_TAlloc(HYPRE_Complex *, num_sends);
+   if (num_sends > 0)
+   {
+      size = hypre_CommPkgSendBufsize(comm_pkg);
+      if (size > global_send_size)
+      {
+         if (global_send_size > 0)
+            hypre_DeviceTFree(global_send_buffer);
+         global_send_buffer = hypre_DeviceCTAlloc(HYPRE_Complex, 5*size);
+         global_send_size   = 5*size;
+      }
+      send_buffers_data[0] = global_send_buffer;
+      for (i = 1; i < num_sends; i++)
+      {
+         comm_type = hypre_CommPkgSendType(comm_pkg, i-1);
+         size = hypre_CommTypeBufsize(comm_type);
+         send_buffers_data[i] = send_buffers_data[i-1] + size;
+      }
+   }
+#else
+   send_buffers_data = send_buffers;
+#endif
+
    /* allocate recv buffers */
    recv_buffers = hypre_TAlloc(HYPRE_Complex *, num_recvs);
    if (num_recvs > 0)
@@ -842,6 +878,31 @@ hypre_InitializeCommunication( hypre_CommPkg     *comm_pkg,
       }
    }
 
+   /* Prepare recv buffers */
+#if defined(HYPRE_MEMORY_GPU)
+   recv_buffers_data = hypre_TAlloc(HYPRE_Complex *, num_recvs);
+   if (num_recvs > 0)
+   {
+      size = hypre_CommPkgRecvBufsize(comm_pkg);
+      if (size > global_recv_size)
+      {
+         if (global_recv_size > 0)
+            hypre_DeviceTFree(global_recv_buffer);
+         global_recv_buffer = hypre_DeviceCTAlloc(HYPRE_Complex, 5*size);
+         global_recv_size   = 5*size;
+      }
+      recv_buffers_data[0] = global_recv_buffer;
+      for (i = 1; i < num_recvs; i++)
+      {
+         comm_type = hypre_CommPkgRecvType(comm_pkg, i-1);
+         size = hypre_CommTypeBufsize(comm_type);
+         recv_buffers_data[i] = recv_buffers_data[i-1] + size;
+      }
+   }
+#else
+   recv_buffers_data = recv_buffers;
+#endif
+
    /*--------------------------------------------------------------------
     * pack send buffers
     *--------------------------------------------------------------------*/
@@ -851,22 +912,9 @@ hypre_InitializeCommunication( hypre_CommPkg     *comm_pkg,
       comm_type = hypre_CommPkgSendType(comm_pkg, i);
       num_entries = hypre_CommTypeNumEntries(comm_type);
 
-      dptr = (HYPRE_Complex *) send_buffers[i];
-
+      dptr = (HYPRE_Complex *) send_buffers_data[i];
       if ( hypre_CommPkgFirstComm(comm_pkg) )
       {
-         qptr = (HYPRE_Int *) send_buffers[i];
-         *qptr = num_entries;
-         qptr ++;
-         memcpy(qptr, hypre_CommTypeRemBoxnums(comm_type),
-                num_entries*sizeof(HYPRE_Int));
-         qptr += num_entries;
-         memcpy(qptr, hypre_CommTypeRemBoxes(comm_type),
-                num_entries*sizeof(hypre_Box));
-
-         hypre_CommTypeRemBoxnums(comm_type) = NULL;
-         hypre_CommTypeRemBoxes(comm_type)   = NULL;
-
          dptr += hypre_CommPrefixSize(num_entries);
       }
 
@@ -876,6 +924,11 @@ hypre_InitializeCommunication( hypre_CommPkg     *comm_pkg,
          length_array = hypre_CommEntryTypeLengthArray(comm_entry);
          stride_array = hypre_CommEntryTypeStrideArray(comm_entry);
          order = hypre_CommEntryTypeOrder(comm_entry);
+         unitst_array[0] = 1;
+         for (d = 1; d <= ndim; d++)
+         {
+            unitst_array[d] = unitst_array[d-1]*length_array[d-1];
+         }
 
          lptr = send_data + hypre_CommEntryTypeOffset(comm_entry);
          for (ll = 0; ll < num_values; ll++)
@@ -884,50 +937,46 @@ hypre_InitializeCommunication( hypre_CommPkg     *comm_pkg,
             {
                kptr = lptr + order[ll]*stride_array[ndim];
 
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA)
                /* This is based on "Idea 2" in box.h */
                {
-                  HYPRE_Int      i[HYPRE_MAXDIM+1];
                   HYPRE_Int      n[HYPRE_MAXDIM+1];
                   HYPRE_Int      s[HYPRE_MAXDIM+1];
-                  HYPRE_Complex *p[HYPRE_MAXDIM+1];
-                  HYPRE_Int      I, N;
+                  HYPRE_Int      N;
 
                   /* Initialize */
                   N = 1;
                   for (d = 0; d < ndim; d++)
                   {
-                     i[d] = 0;
                      n[d] = length_array[d];
                      s[d] = stride_array[d];
-                     p[d] = kptr;
                      N *= n[d];
                   }
-                  i[ndim] = 0;
                   n[ndim] = 2;
                   s[ndim] = 0;
-                  p[ndim] = kptr;
 
                   /* Emulate ndim nested for loops */
-                  d = 0;
-                  for (I = 0; I < N; I++)
+                  hypre_BoxBoundaryCopyBegin(ndim, n, s, i, idx)
                   {
-                     dptr[I] = *p[0];
-
-                     while ( (i[d]+2) > n[d] )
-                     {
-                        d++;
-                     }
-                     i[d]++;
-                     p[d] += s[d];
-                     while ( d > 0 )
-                     {
-                        d--;
-                        i[d] = 0;
-                        p[d] = p[d+1];
-                     }
+                     dptr[idx] = kptr[i];
                   }
-                  dptr += N;
+                  hypre_BoxBoundaryCopyEnd();
                }
+#else
+               hypre_BasicBoxLoop2Begin(ndim, length_array,
+                                        stride_array, ki,
+                                        unitst_array, di);
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
+#endif
+               hypre_BoxLoop2For(ki, di)
+               {
+                  dptr[di] = kptr[ki];
+               }
+               hypre_BoxLoop2End(ki, di);
+#endif
+
+               dptr += unitst_array[ndim];
             }
             else
             {
@@ -936,13 +985,47 @@ hypre_InitializeCommunication( hypre_CommPkg     *comm_pkg,
                {
                   size *= length_array[d];
                }
-               memset(dptr, 0, size*sizeof(HYPRE_Complex));
+               hypre_DeviceMemset(dptr, 0, HYPRE_Complex, size);
+
                dptr += size;
             }
          }
       }
    }
 
+   /* Copy buffer data from Device to Host */
+#if defined(HYPRE_MEMORY_GPU)
+   if (num_sends > 0)
+   {
+      HYPRE_Complex  *dptr_host;
+      size = hypre_CommPkgSendBufsize(comm_pkg);
+      dptr_host = (HYPRE_Complex *) send_buffers[0];
+      dptr      = (HYPRE_Complex *) send_buffers_data[0];
+      hypre_DataCopyFromData(dptr_host, dptr, HYPRE_Complex, size);
+   }
+#endif
+
+   for (i = 0; i < num_sends; i++)
+   {
+      comm_type = hypre_CommPkgSendType(comm_pkg, i);
+      num_entries = hypre_CommTypeNumEntries(comm_type);
+
+      dptr = (HYPRE_Complex *) send_buffers[i];
+      if ( hypre_CommPkgFirstComm(comm_pkg) )
+      {
+         qptr = (HYPRE_Int *) send_buffers[i];
+         *qptr = num_entries;
+         qptr ++;
+         memcpy(qptr, hypre_CommTypeRemBoxnums(comm_type),
+                num_entries*sizeof(HYPRE_Int));
+         qptr += num_entries;
+         memcpy(qptr, hypre_CommTypeRemBoxes(comm_type),
+                num_entries*sizeof(hypre_Box));
+         hypre_CommTypeRemBoxnums(comm_type) = NULL;
+         hypre_CommTypeRemBoxes(comm_type)   = NULL;
+      }
+   }
+
    /*--------------------------------------------------------------------
     * post receives and initiate sends
     *--------------------------------------------------------------------*/
@@ -1020,6 +1103,8 @@ hypre_InitializeCommunication( hypre_CommPkg     *comm_pkg,
    hypre_CommHandleSendBuffers(comm_handle) = send_buffers;
    hypre_CommHandleRecvBuffers(comm_handle) = recv_buffers;
    hypre_CommHandleAction(comm_handle)      = action;
+   hypre_CommHandleSendBuffersDevice(comm_handle) = send_buffers_data;
+   hypre_CommHandleRecvBuffersDevice(comm_handle) = recv_buffers_data;
 
    *comm_handle_ptr = comm_handle;
 
@@ -1041,7 +1126,7 @@ hypre_FinalizeCommunication( hypre_CommHandle *comm_handle )
    HYPRE_Complex      **send_buffers = hypre_CommHandleSendBuffers(comm_handle);
    HYPRE_Complex      **recv_buffers = hypre_CommHandleRecvBuffers(comm_handle);
    HYPRE_Int            action       = hypre_CommHandleAction(comm_handle);
-                      
+
    HYPRE_Int            ndim         = hypre_CommPkgNDim(comm_pkg);
    HYPRE_Int            num_values   = hypre_CommPkgNumValues(comm_pkg);
    HYPRE_Int            num_sends    = hypre_CommPkgNumSends(comm_pkg);
@@ -1052,7 +1137,7 @@ hypre_FinalizeCommunication( hypre_CommHandle *comm_handle )
    HYPRE_Int            num_entries;
 
    HYPRE_Int           *length_array;
-   HYPRE_Int           *stride_array;
+   HYPRE_Int           *stride_array, unitst_array[HYPRE_MAXDIM+1];
 
    HYPRE_Complex       *kptr, *lptr;
    HYPRE_Complex       *dptr;
@@ -1063,6 +1148,11 @@ hypre_FinalizeCommunication( hypre_CommHandle *comm_handle )
 
    HYPRE_Int            i, j, d, ll;
 
+#if defined(HYPRE_MEMORY_GPU)
+   HYPRE_Complex      **send_buffers_data = hypre_CommHandleSendBuffersDevice(comm_handle);
+#endif
+   HYPRE_Complex      **recv_buffers_data = hypre_CommHandleRecvBuffersDevice(comm_handle);
+
    /*--------------------------------------------------------------------
     * finish communications
     *--------------------------------------------------------------------*/
@@ -1123,12 +1213,36 @@ hypre_FinalizeCommunication( hypre_CommHandle *comm_handle )
     * unpack receive buffer data
     *--------------------------------------------------------------------*/
 
+   /* Copy buffer data from Host to Device */
+#if defined(HYPRE_MEMORY_GPU)
+   if (num_recvs > 0)
+   {
+      HYPRE_Complex  *dptr_host;
+      HYPRE_Int       size;
+      size = 0;
+      for (i = 0; i < num_recvs; i++)
+      {
+         comm_type = hypre_CommPkgRecvType(comm_pkg, i);
+         num_entries = hypre_CommTypeNumEntries(comm_type);
+         size += hypre_CommTypeBufsize(comm_type);
+         if ( hypre_CommPkgFirstComm(comm_pkg) )
+         {
+            size += hypre_CommPrefixSize(num_entries);
+         }
+      }
+      dptr_host = (HYPRE_Complex *) recv_buffers[0];
+      dptr      = (HYPRE_Complex *) recv_buffers_data[0];
+      hypre_DataCopyToData(dptr_host, dptr, HYPRE_Complex, size);
+   }
+#endif
+
    for (i = 0; i < num_recvs; i++)
    {
       comm_type = hypre_CommPkgRecvType(comm_pkg, i);
       num_entries = hypre_CommTypeNumEntries(comm_type);
 
-      dptr = (HYPRE_Complex *) recv_buffers[i];
+      dptr = (HYPRE_Complex *) recv_buffers_data[i];
+
       if ( hypre_CommPkgFirstComm(comm_pkg) )
       {
          dptr += hypre_CommPrefixSize(num_entries);
@@ -1139,6 +1253,11 @@ hypre_FinalizeCommunication( hypre_CommHandle *comm_handle )
          comm_entry = hypre_CommTypeEntry(comm_type, j);
          length_array = hypre_CommEntryTypeLengthArray(comm_entry);
          stride_array = hypre_CommEntryTypeStrideArray(comm_entry);
+         unitst_array[0] = 1;
+         for (d = 1; d <= ndim; d++)
+         {
+            unitst_array[d] = unitst_array[d-1]*length_array[d-1];
+         }
 
          lptr = hypre_CommHandleRecvData(comm_handle) +
             hypre_CommEntryTypeOffset(comm_entry);
@@ -1146,59 +1265,60 @@ hypre_FinalizeCommunication( hypre_CommHandle *comm_handle )
          {
             kptr = lptr + ll*stride_array[ndim];
 
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA)
             /* This is based on "Idea 2" in box.h */
             {
-               HYPRE_Int      i[HYPRE_MAXDIM+1];
                HYPRE_Int      n[HYPRE_MAXDIM+1];
                HYPRE_Int      s[HYPRE_MAXDIM+1];
-               HYPRE_Complex *p[HYPRE_MAXDIM+1];
-               HYPRE_Int      I, N;
+               HYPRE_Int      N;
 
                /* Initialize */
                N = 1;
                for (d = 0; d < ndim; d++)
                {
-                  i[d] = 0;
                   n[d] = length_array[d];
                   s[d] = stride_array[d];
-                  p[d] = kptr;
                   N *= n[d];
                }
-               i[ndim] = 0;
                n[ndim] = 2;
                s[ndim] = 0;
-               p[ndim] = kptr;
 
                /* Emulate ndim nested for loops */
-               d = 0;
-               for (I = 0; I < N; I++)
+               hypre_BoxBoundaryCopyBegin(ndim, n, s, i, idx)
                {
                   if (action > 0)
                   {
-                     /* add the data to existing values in memory */
-                     *p[0] += dptr[I];
+                     kptr[i] += dptr[idx];
                   }
                   else
                   {
-                     /* copy the data over existing values in memory */
-                     *p[0] = dptr[I];
-                  }
-
-                  while ( (i[d]+2) > n[d] )
-                  {
-                     d++;
-                  }
-                  i[d]++;
-                  p[d] += s[d];
-                  while ( d > 0 )
-                  {
-                     d--;
-                     i[d] = 0;
-                     p[d] = p[d+1];
+                     kptr[i] = dptr[idx];
                   }
                }
-               dptr += N;
+               hypre_BoxBoundaryCopyEnd();
+            }
+#else
+            hypre_BasicBoxLoop2Begin(ndim, length_array,
+                                     stride_array, ki,
+                                     unitst_array, di);
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
+#endif
+            hypre_BoxLoop2For(ki, di)
+            {
+               if (action > 0)
+               {
+                  kptr[ki] += dptr[di];
+               }
+               else
+               {
+                  kptr[ki] = dptr[di];
+               }
             }
+            hypre_BoxLoop2End(ki, di);
+#endif
+
+            dptr += unitst_array[ndim];
          }
       }
    }
@@ -1223,9 +1343,18 @@ hypre_FinalizeCommunication( hypre_CommHandle *comm_handle )
    {
       hypre_SharedTFree(recv_buffers[0]);
    }
+
+   hypre_TFree(comm_handle);
+
+#if defined(HYPRE_MEMORY_GPU)
    hypre_TFree(send_buffers);
    hypre_TFree(recv_buffers);
-   hypre_TFree(comm_handle);
+   hypre_TFree(send_buffers_data);
+   hypre_TFree(recv_buffers_data);
+#else
+   hypre_TFree(send_buffers);
+   hypre_TFree(recv_buffers);
+#endif
 
    return hypre_error_flag;
 }
@@ -1251,9 +1380,10 @@ hypre_ExchangeLocalData( hypre_CommPkg *comm_pkg,
    HYPRE_Int           *fr_stride_array;
    HYPRE_Complex       *to_dp;
    HYPRE_Int           *to_stride_array;
-                      
+   HYPRE_Complex       *fr_dpl, *to_dpl;
+
    HYPRE_Int           *length_array;
-   HYPRE_Int            i, d, ll;
+   HYPRE_Int            i, ll;
 
    HYPRE_Int           *order;
 
@@ -1285,17 +1415,20 @@ hypre_ExchangeLocalData( hypre_CommPkg *comm_pkg,
          {
             if (order[ll] > -1)
             {
+               fr_dpl = fr_dp + (order[ll])*fr_stride_array[ndim];
+               to_dpl = to_dp + (      ll )*to_stride_array[ndim];
+
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_RAJA) || defined(HYPRE_USE_KOKKOS) || defined(HYPRE_USE_CUDA)
                /* This is based on "Idea 2" in box.h */
                {
-                  HYPRE_Int      i[HYPRE_MAXDIM+1];
+                  //HYPRE_Int      i[HYPRE_MAXDIM+1];
                   HYPRE_Int      n[HYPRE_MAXDIM+1];
                   HYPRE_Int      fs[HYPRE_MAXDIM+1],  ts[HYPRE_MAXDIM+1];
                   HYPRE_Complex *fp[HYPRE_MAXDIM+1], *tp[HYPRE_MAXDIM+1];
-                  HYPRE_Int      I, N;
+                  HYPRE_Int      N,d;
 
                   /* Initialize */
                   N = 1;
-                  i[ndim]  = 0;
                   n[ndim]  = 2;
                   fs[ndim] = 0;
                   ts[ndim] = 0;
@@ -1303,7 +1436,6 @@ hypre_ExchangeLocalData( hypre_CommPkg *comm_pkg,
                   tp[ndim] = to_dp + (      ll )*to_stride_array[ndim];
                   for (d = 0; d < ndim; d++)
                   {
-                     i[d]  = 0;
                      n[d]  = length_array[d];
                      fs[d] = fr_stride_array[d];
                      ts[d] = to_stride_array[d];
@@ -1313,36 +1445,43 @@ hypre_ExchangeLocalData( hypre_CommPkg *comm_pkg,
                   }
 
                   /* Emulate ndim nested for loops */
-                  d = 0;
-                  for (I = 0; I < N; I++)
+                  hypre_BoxDataExchangeBegin(ndim, n, fs, i1, ts, i2)
                   {
                      if (action > 0)
                      {
                         /* add the data to existing values in memory */
-                        *tp[0] += *fp[0];
+                        to_dpl[i2] += fr_dpl[i1];
                      }
                      else
                      {
                         /* copy the data over existing values in memory */
-                        *tp[0] = *fp[0];
-                     }
-
-                     while ( (i[d]+2) > n[d] )
-                     {
-                        d++;
-                     }
-                     i[d]++;
-                     fp[d] += fs[d];
-                     tp[d] += ts[d];
-                     while ( d > 0 )
-                     {
-                        d--;
-                        i[d] = 0;
-                        fp[d] = fp[d+1];
-                        tp[d] = tp[d+1];
+                        to_dpl[i2] = fr_dpl[i1];
                      }
                   }
+                  hypre_BoxDataExchangeEnd();
+               }
+#else
+               hypre_BasicBoxLoop2Begin(ndim, length_array,
+                                        fr_stride_array, fi,
+                                        to_stride_array, ti);
+#ifdef HYPRE_USING_OPENMP
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
+#endif
+               hypre_BoxLoop2For(fi, ti)
+               {
+                  if (action > 0)
+                  {
+                     /* add the data to existing values in memory */
+                     to_dpl[ti] += fr_dpl[fi];
+                  }
+                  else
+                  {
+                     /* copy the data over existing values in memory */
+                     to_dpl[ti] = fr_dpl[fi];
+                  }
                }
+               hypre_BoxLoop2End(fi, ti);
+#endif
             }
          }
       }
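
Several hunks above delete an open-coded walk that the old comments
call "Idea 2" in box.h: one flat loop emulates ndim nested for loops
by keeping an odometer i[] and per-dimension base pointers p[].  The
new hypre_BoxBoundaryCopy and hypre_BoxDataExchange macros encapsulate
the same traversal.  Since the trick disappears from this file, here
is a small self-contained illustration of it (a hypothetical
standalone program, not hypre code):

   #include <stdio.h>

   #define NDIM 2

   int main(void)
   {
      /* Visit a 3x2 box embedded in a 4x3 array (row stride 4) with
       * one flat loop, the way the deleted code above did. */
      double data[12];
      int    k;
      for (k = 0; k < 12; k++) { data[k] = (double) k; }

      {
         int     n[NDIM+1] = {3, 2, 2};  /* extents; n[NDIM]=2 is a sentinel */
         int     s[NDIM+1] = {1, 4, 0};  /* strides per dimension */
         int     i[NDIM+1] = {0, 0, 0};
         double *p[NDIM+1] = {data, data, data};
         int     d = 0, I, N = n[0]*n[1];

         for (I = 0; I < N; I++)
         {
            printf("I=%d value=%g\n", I, *p[0]);  /* prints 0,1,2,4,5,6 */
            while (i[d] + 2 > n[d]) { d++; }      /* carry into next dim */
            i[d]++;
            p[d] += s[d];
            while (d > 0)                         /* rebase lower dims */
            {
               d--;
               i[d] = 0;
               p[d] = p[d+1];
            }
         }
      }
      return 0;
   }
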
diff --git a/src/struct_mv/struct_communication.h b/src/struct_mv/struct_communication.h
index a67a82a..7cdb6fd 100644
--- a/src/struct_mv/struct_communication.h
+++ b/src/struct_mv/struct_communication.h
@@ -141,6 +141,9 @@ typedef struct hypre_CommHandle_struct
    HYPRE_Complex **send_buffers;
    HYPRE_Complex **recv_buffers;
 
+   HYPRE_Complex      **send_buffers_data;
+   HYPRE_Complex      **recv_buffers_data;
+	
    /* set = 0, add = 1 */
    HYPRE_Int       action;
 
@@ -248,5 +251,7 @@ typedef struct hypre_CommHandle_struct
 #define hypre_CommHandleSendBuffers(comm_handle) (comm_handle -> send_buffers)
 #define hypre_CommHandleRecvBuffers(comm_handle) (comm_handle -> recv_buffers)
 #define hypre_CommHandleAction(comm_handle)      (comm_handle -> action)
+#define hypre_CommHandleSendBuffersDevice(comm_handle)    (comm_handle -> send_buffers_data)
+#define hypre_CommHandleRecvBuffersDevice(comm_handle)    (comm_handle -> recv_buffers_data)
 
 #endif
diff --git a/src/struct_mv/struct_copy.c b/src/struct_mv/struct_copy.c
index 83ed596..3901f35 100644
--- a/src/struct_mv/struct_copy.c
+++ b/src/struct_mv/struct_copy.c
@@ -29,9 +29,6 @@ hypre_StructCopy( hypre_StructVector *x,
    hypre_Box       *x_data_box;
    hypre_Box       *y_data_box;
                    
-   HYPRE_Int        xi;
-   HYPRE_Int        yi;
-                   
    HYPRE_Complex   *xp;
    HYPRE_Complex   *yp;
                    
@@ -63,7 +60,7 @@ hypre_StructCopy( hypre_StructVector *x,
                           x_data_box, start, unit_stride, xi,
                           y_data_box, start, unit_stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(xi, yi)
       {
@@ -89,9 +86,6 @@ hypre_StructPartialCopy( hypre_StructVector  *x,
    hypre_Box       *x_data_box;
    hypre_Box       *y_data_box;
 
-   HYPRE_Int        xi;
-   HYPRE_Int        yi;
-
    HYPRE_Complex   *xp;
    HYPRE_Complex   *yp;
 
@@ -127,7 +121,7 @@ hypre_StructPartialCopy( hypre_StructVector  *x,
                              x_data_box, start, unit_stride, xi,
                              y_data_box, start, unit_stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop2For(xi, yi)
          {
diff --git a/src/struct_mv/struct_grid.c b/src/struct_mv/struct_grid.c
index 8a1fe6a..a4f872a 100644
--- a/src/struct_mv/struct_grid.c
+++ b/src/struct_mv/struct_grid.c
@@ -324,9 +324,7 @@ hypre_StructGridAssemble( hypre_StructGrid *grid )
    {
       p = 1;
       hypre_BoxGetSize(box, loop_size);
-      hypre_BoxLoop0Begin(ndim, loop_size);
-      hypre_BoxLoopSetOneBlock();
-      hypre_BoxLoop0For()
+      hypre_SerialBoxLoop0Begin(ndim, loop_size);
       {
          pshift = pshifts[p];
          hypre_BoxLoopGetIndex(pshift);
@@ -345,7 +343,7 @@ hypre_StructGridAssemble( hypre_StructGrid *grid )
             p++;
          }
       }
-      hypre_BoxLoop0End();
+      hypre_SerialBoxLoop0End();
    }
    hypre_BoxDestroy(box);
    
diff --git a/src/struct_mv/struct_innerprod.c b/src/struct_mv/struct_innerprod.c
index a14a71a..957aa4d 100644
--- a/src/struct_mv/struct_innerprod.c
+++ b/src/struct_mv/struct_innerprod.c
@@ -27,15 +27,11 @@ hypre_StructInnerProd( hypre_StructVector *x,
                        hypre_StructVector *y )
 {
    HYPRE_Real       final_innerprod_result;
-   HYPRE_Real       local_result;
    HYPRE_Real       process_result;
                    
    hypre_Box       *x_data_box;
    hypre_Box       *y_data_box;
                    
-   HYPRE_Int        xi;
-   HYPRE_Int        yi;
-                   
    HYPRE_Complex   *xp;
    HYPRE_Complex   *yp;
                    
@@ -44,42 +40,46 @@ hypre_StructInnerProd( hypre_StructVector *x,
    hypre_Index      loop_size;
    hypre_IndexRef   start;
    hypre_Index      unit_stride;
-                   
+    
+   HYPRE_Int         ndim = hypre_StructVectorNDim(x);               
    HYPRE_Int        i;
 
-   local_result = 0.0;
-   process_result = 0.0;
-
+   hypre_Reductioninit(local_result);
+   
    hypre_SetIndex(unit_stride, 1);
-
+   
    boxes = hypre_StructGridBoxes(hypre_StructVectorGrid(y));
    hypre_ForBoxI(i, boxes)
    {
       box   = hypre_BoxArrayBox(boxes, i);
       start = hypre_BoxIMin(box);
-
+     
       x_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i);
       y_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(y), i);
-
+     
       xp = hypre_StructVectorBoxData(x, i);
       yp = hypre_StructVectorBoxData(y, i);
-
+     
       hypre_BoxGetSize(box, loop_size);
 
-      hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size,
-                          x_data_box, start, unit_stride, xi,
-                          y_data_box, start, unit_stride, yi);
-#ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi,yi) reduction(+:local_result) HYPRE_SMP_SCHEDULE
+#ifdef HYPRE_BOX_PRIVATE_VAR
+#undef HYPRE_BOX_PRIVATE_VAR
 #endif
-      hypre_BoxLoop2For(xi, yi)
+#define HYPRE_BOX_PRIVATE_VAR xi,yi
+#ifdef HYPRE_BOX_REDUCTION
+#undef HYPRE_BOX_REDUCTION
+#endif
+#define HYPRE_BOX_REDUCTION reduction(+:local_result)
+      hypre_newBoxLoop2ReductionBegin(ndim, loop_size,
+                                      x_data_box, start, unit_stride, xi,
+                                      y_data_box, start, unit_stride, yi, local_result);
       {
-         local_result += xp[xi] * hypre_conj(yp[yi]);
+         local_result += xp[xi] * hypre_conj(yp[yi]);		 
       }
-      hypre_BoxLoop2End(xi, yi);
+      hypre_newBoxLoop2ReductionEnd(xi, yi, local_result);
    }
    process_result = local_result;
-
+   
    hypre_MPI_Allreduce(&process_result, &final_innerprod_result, 1,
                        HYPRE_MPI_REAL, hypre_MPI_SUM, hypre_StructVectorComm(x));
 
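
The rewritten inner product now reduces in two stages: the box loop
accumulates local_result within the process (with the reduction clause
injected through the redefinable HYPRE_BOX_REDUCTION macro), and
hypre_MPI_Allreduce then sums the per-process partial results.  A
minimal standalone analogue of that structure in plain MPI (a
hypothetical example, not hypre code):

   #include <stdio.h>
   #include <mpi.h>

   int main(int argc, char *argv[])
   {
      double x[4] = {1, 2, 3, 4}, y[4] = {1, 1, 1, 1};
      double local_result = 0.0, final_result = 0.0;
      int    k;

      MPI_Init(&argc, &argv);

      for (k = 0; k < 4; k++)   /* stage 1: per-process reduction */
      {
         local_result += x[k] * y[k];
      }

      /* stage 2: combine the partial sums across all processes */
      MPI_Allreduce(&local_result, &final_result, 1, MPI_DOUBLE,
                    MPI_SUM, MPI_COMM_WORLD);

      printf("inner product = %g\n", final_result);
      MPI_Finalize();
      return 0;
   }
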
diff --git a/src/struct_mv/struct_io.c b/src/struct_mv/struct_io.c
index 81fa10f..d13c2c3 100644
--- a/src/struct_mv/struct_io.c
+++ b/src/struct_mv/struct_io.c
@@ -34,7 +34,6 @@ hypre_PrintBoxArrayData( FILE            *file,
    hypre_Box       *data_box;
                    
    HYPRE_Int        data_box_volume;
-   HYPRE_Int        datai;
                    
    hypre_Index      loop_size;
    hypre_IndexRef   start;
@@ -43,11 +42,12 @@ hypre_PrintBoxArrayData( FILE            *file,
                    
    HYPRE_Int        i, j, d;
    HYPRE_Complex    value;
-
+   HYPRE_Complex *data_host;
    /*----------------------------------------
     * Print data
     *----------------------------------------*/
-
+   hypre_StructPreparePrint();
+   
    hypre_SetIndex(stride, 1);
 
    hypre_ForBoxI(i, box_array)
@@ -59,10 +59,9 @@ hypre_PrintBoxArrayData( FILE            *file,
       data_box_volume = hypre_BoxVolume(data_box);
 
       hypre_BoxGetSize(box, loop_size);
-
-      hypre_BoxLoop1Begin(dim, loop_size,
-                          data_box, start, stride, datai);
-      hypre_BoxLoop1For(datai)
+ 
+      hypre_SerialBoxLoop1Begin(dim, loop_size,
+                                data_box, start, stride, datai);
       {
          /* Print lines of the form: "%d: (%d, %d, %d; %d) %.14e\n" */
          hypre_BoxLoopGetIndex(index);
@@ -75,7 +74,7 @@ hypre_PrintBoxArrayData( FILE            *file,
                hypre_fprintf(file, ", %d",
                              hypre_IndexD(start, d) + hypre_IndexD(index, d));
             }
-            value = data[datai + j*data_box_volume];
+            value = data_host[datai + j*data_box_volume];
 #ifdef HYPRE_COMPLEX
             hypre_fprintf(file, "; %d) %.14e , %.14e\n",
                           j, hypre_creal(value), hypre_cimag(value));
@@ -84,11 +83,13 @@ hypre_PrintBoxArrayData( FILE            *file,
 #endif
          }
       }
-      hypre_BoxLoop1End(datai);
+      hypre_SerialBoxLoop1End(datai);
 
-      data += num_values*data_box_volume;
+      data_host += num_values*data_box_volume;
    }
 
+   hypre_StructPostPrint();
+   
    return hypre_error_flag;
 }
 
@@ -112,7 +113,7 @@ hypre_PrintCCVDBoxArrayData( FILE            *file,
    hypre_Box       *box;
    hypre_Box       *data_box;
                    
-   HYPRE_Int        data_box_volume, datai;
+   HYPRE_Int        data_box_volume;
                    
    hypre_Index      loop_size;
    hypre_IndexRef   start;
@@ -156,9 +157,8 @@ hypre_PrintCCVDBoxArrayData( FILE            *file,
 
       hypre_BoxGetSize(box, loop_size);
 
-      hypre_BoxLoop1Begin(dim, loop_size,
-                          data_box, start, stride, datai);
-      hypre_BoxLoop1For(datai)
+      hypre_SerialBoxLoop1Begin(dim, loop_size,
+                                data_box, start, stride, datai);
       {
          /* Print line of the form: "%d: (%d, %d, %d; %d) %.14e\n" */
          hypre_BoxLoopGetIndex(index);
@@ -177,7 +177,7 @@ hypre_PrintCCVDBoxArrayData( FILE            *file,
          hypre_fprintf(file, "; %d) %.14e\n", center_rank, value);
 #endif
       }
-      hypre_BoxLoop1End(datai);
+      hypre_SerialBoxLoop1End(datai);
       data += data_box_volume;
    }
 
@@ -242,7 +242,6 @@ hypre_ReadBoxArrayData( FILE            *file,
    hypre_Box       *data_box;
                    
    HYPRE_Int        data_box_volume;
-   HYPRE_Int        datai;
                    
    hypre_Index      loop_size;
    hypre_IndexRef   start;
@@ -266,9 +265,8 @@ hypre_ReadBoxArrayData( FILE            *file,
 
       hypre_BoxGetSize(box, loop_size);
 
-      hypre_BoxLoop1Begin(dim, loop_size,
-                          data_box, start, stride, datai);
-      hypre_BoxLoop1For(datai)
+      hypre_SerialBoxLoop1Begin(dim, loop_size,
+                                data_box, start, stride, datai);
       {
          /* Read lines of the form: "%d: (%d, %d, %d; %d) %le\n" */
          for (j = 0; j < num_values; j++)
@@ -282,7 +280,7 @@ hypre_ReadBoxArrayData( FILE            *file,
                          &idummy, &data[datai + j*data_box_volume]);
          }
       }
-      hypre_BoxLoop1End(datai);
+      hypre_SerialBoxLoop1End(datai);
 
       data += num_values*data_box_volume;
    }
@@ -308,7 +306,6 @@ hypre_ReadBoxArrayData_CC( FILE            *file,
    hypre_Box       *data_box;
                    
    HYPRE_Int        data_box_volume, constant_stencil_size;
-   HYPRE_Int        datai;
                    
    hypre_Index      loop_size;
    hypre_IndexRef   start;
@@ -348,9 +345,8 @@ hypre_ReadBoxArrayData_CC( FILE            *file,
 
       if ( constant_coefficient==2 )
       {
-         hypre_BoxLoop1Begin(dim, loop_size,
-                             data_box, start, stride, datai);
-         hypre_BoxLoop1For(datai)
+         hypre_SerialBoxLoop1Begin(dim, loop_size,
+                                   data_box, start, stride, datai);
          {
             /* Read line of the form: "%d: (%d, %d, %d; %d) %.14e\n" */
             hypre_fscanf(file, "%d: (%d", &idummy, &idummy);
@@ -360,7 +356,7 @@ hypre_ReadBoxArrayData_CC( FILE            *file,
             }
             hypre_fscanf(file, "; %d) %le\n", &idummy, &data[datai]);
          }
-         hypre_BoxLoop1End(datai);
+         hypre_SerialBoxLoop1End(datai);
          data += data_box_volume;
       }
 
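
Note that in the hunks above data_host is read without a visible
assignment; it is presumably set inside the hypre_StructPreparePrint()
macro, which on GPU builds would stage the device-resident data into a
host buffer that hypre_StructPostPrint() later releases.  Under that
assumption, the staging step amounts to something like this
hypothetical helper:

   #include <stdlib.h>
   #include <string.h>

   /* Hypothetical sketch of host staging; on a real GPU build the
    * memcpy would be a device-to-host copy instead. */
   static double *stage_to_host(const double *data, size_t n)
   {
      double *data_host = (double *) malloc(n * sizeof(double));
      if (data_host != NULL)
      {
         memcpy(data_host, data, n * sizeof(double));
      }
      return data_host;
   }
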
diff --git a/src/struct_mv/struct_matrix.c b/src/struct_mv/struct_matrix.c
index 61f8e5c..cc1edcd 100644
--- a/src/struct_mv/struct_matrix.c
+++ b/src/struct_mv/struct_matrix.c
@@ -103,12 +103,18 @@ hypre_StructMatrixDestroy( hypre_StructMatrix *matrix )
       {
          if (hypre_StructMatrixDataAlloced(matrix))
          {
-            hypre_SharedTFree(hypre_StructMatrixData(matrix));
+            hypre_DeviceTFree(hypre_StructMatrixData(matrix));
          }
          hypre_CommPkgDestroy(hypre_StructMatrixCommPkg(matrix));
          
          hypre_ForBoxI(i, hypre_StructMatrixDataSpace(matrix))
-            hypre_TFree(hypre_StructMatrixDataIndices(matrix)[i]);
+         {
+            if (hypre_StructMatrixConstantCoefficient(matrix) < 2)
+               hypre_TFree(hypre_StructMatrixDataIndices(matrix)[i]);
+            else
+               hypre_UMTFree(hypre_StructMatrixDataIndices(matrix)[i]); 
+         }
+
          hypre_TFree(hypre_StructMatrixDataIndices(matrix));
          
          hypre_BoxArrayDestroy(hypre_StructMatrixDataSpace(matrix));
@@ -340,7 +346,7 @@ hypre_StructMatrixInitializeShell( hypre_StructMatrix *matrix )
             data_box = hypre_BoxArrayBox(data_space, i);
             data_box_volume  = hypre_BoxVolume(data_box);
 
-            data_indices[i] = hypre_CTAlloc(HYPRE_Int, stencil_size);
+            data_indices[i] = hypre_UMCTAlloc(HYPRE_Int, stencil_size);
 
             /* set pointers for "stored" coefficients */
             for (j = 0; j < stencil_size; j++)
@@ -423,11 +429,17 @@ HYPRE_Int
 hypre_StructMatrixInitialize( hypre_StructMatrix *matrix )
 {
    HYPRE_Complex *data;
-
+   HYPRE_Int constant_coefficient;
+   constant_coefficient = hypre_StructMatrixConstantCoefficient(matrix);
    hypre_StructMatrixInitializeShell(matrix);
 
-   data = hypre_StructMatrixData(matrix);
-   data = hypre_SharedCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix));
+   //data = hypre_SharedCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix));
+   
+   if (constant_coefficient == 0)
+      data = hypre_DeviceCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix));
+   else
+      data = hypre_UMCTAlloc(HYPRE_Complex, hypre_StructMatrixDataSize(matrix));
+   
    hypre_StructMatrixInitializeData(matrix, data);
    hypre_StructMatrixDataAlloced(matrix) = 1;
 
@@ -1007,7 +1019,6 @@ hypre_StructMatrixClearBoxValues( hypre_StructMatrix *matrix,
    hypre_Box           *data_box;
    hypre_IndexRef       data_start;
    hypre_Index          data_stride;
-   HYPRE_Int            datai;
    HYPRE_Complex       *datap;
                    
    hypre_Index          loop_size;
@@ -1074,7 +1085,7 @@ hypre_StructMatrixClearBoxValues( hypre_StructMatrix *matrix,
                hypre_BoxLoop1Begin(hypre_StructMatrixNDim(matrix), loop_size,
                                    data_box,data_start,data_stride,datai);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop1For(datai)
                {
@@ -1129,7 +1140,7 @@ hypre_StructMatrixAssemble( hypre_StructMatrix *matrix )
    hypre_IndexRef         start;
    hypre_Index            stride;
    HYPRE_Complex         *datap;
-   HYPRE_Int              i, j, ei, datai;
+   HYPRE_Int              i, j, ei;
    HYPRE_Int              num_entries;
    /* End - variables for ghost layer identity code below */
 
@@ -1206,7 +1217,7 @@ hypre_StructMatrixAssemble( hypre_StructMatrix *matrix )
                hypre_BoxLoop1Begin(hypre_StructMatrixNDim(matrix), loop_size,
                                    data_box, start, stride, datai);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop1For(datai)
                {
@@ -1403,7 +1414,6 @@ hypre_StructMatrixClearGhostValues( hypre_StructMatrix *matrix )
    HYPRE_Int             ndim = hypre_StructMatrixNDim(matrix);
    hypre_Box            *m_data_box;
                         
-   HYPRE_Int             mi;
    HYPRE_Complex        *mp;
 
    hypre_StructStencil  *stencil;
@@ -1451,7 +1461,7 @@ hypre_StructMatrixClearGhostValues( hypre_StructMatrix *matrix )
                hypre_BoxLoop1Begin(hypre_StructMatrixNDim(matrix), loop_size,
                                    m_data_box, start, unit_stride, mi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,mi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop1For(mi)
                {
@@ -1832,7 +1842,7 @@ hypre_StructMatrixClearBoundary( hypre_StructMatrix *matrix)
    hypre_StructStencil *stencil;
    hypre_BoxArray      *boundary;
 
-   HYPRE_Int           i, i2, ixyz, j;
+   HYPRE_Int           i, i2, j;
 
    /*-----------------------------------------------------------------------
     * Set the matrix coefficients
@@ -1857,7 +1867,7 @@ hypre_StructMatrixClearBoundary( hypre_StructMatrix *matrix)
             data_box = hypre_BoxArrayBox(data_space, i);
             boundary = hypre_BoxArrayCreate( 0, ndim );
             hypre_GeneralBoxBoundaryIntersect(grid_box, grid, stencil_element,
-                boundary);
+                                              boundary);
             data = hypre_StructMatrixBoxData(matrix, i, j);
             hypre_ForBoxI(i2, boundary)
             {
@@ -1866,7 +1876,7 @@ hypre_StructMatrixClearBoundary( hypre_StructMatrix *matrix)
                start = hypre_BoxIMin(tmp_box);
                hypre_BoxLoop1Begin(ndim, loop_size, data_box, start, stride, ixyz);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,ixyz) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                hypre_BoxLoop1For(ixyz)
                {
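
The allocation changes above route by coefficient type: fully variable
matrices (constant_coefficient == 0) place their data in device memory
via hypre_DeviceCTAlloc, while matrices with constant coefficients,
like the per-box data_indices arrays, go to unified memory via
hypre_UMCTAlloc, presumably so that the host can still dereference
them directly.  The routing, reduced to its core:

   /* Sketch; the macro names are those introduced by this patch, and
    * size stands for hypre_StructMatrixDataSize(matrix). */
   if (constant_coefficient == 0)
   {
      data = hypre_DeviceCTAlloc(HYPRE_Complex, size);  /* device-resident */
   }
   else
   {
      data = hypre_UMCTAlloc(HYPRE_Complex, size);      /* unified memory */
   }
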
diff --git a/src/struct_mv/struct_matvec.c b/src/struct_mv/struct_matvec.c
index cf080bc..d06a4e5 100644
--- a/src/struct_mv/struct_matvec.c
+++ b/src/struct_mv/struct_matvec.c
@@ -109,8 +109,6 @@ hypre_StructMatvecCompute( void               *matvec_vdata,
    hypre_BoxArrayArray     *compute_box_aa;
    hypre_Box               *y_data_box;
                           
-   HYPRE_Int                yi;
-                          
    HYPRE_Complex           *xp;
    HYPRE_Complex           *yp;
                           
@@ -158,7 +156,7 @@ hypre_StructMatvecCompute( void               *matvec_vdata,
          hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size,
                              y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(yi)
          {
@@ -222,7 +220,7 @@ hypre_StructMatvecCompute( void               *matvec_vdata,
                      hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size,
                                          y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop1For(yi)
                      {
@@ -237,7 +235,7 @@ hypre_StructMatvecCompute( void               *matvec_vdata,
                      hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size,
                                          y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                      hypre_BoxLoop1For(yi)
                      {
@@ -321,8 +319,6 @@ HYPRE_Int hypre_StructMatvecCC0( HYPRE_Complex       alpha,
    HYPRE_Int                xoff4;
    HYPRE_Int                xoff5;
    HYPRE_Int                xoff6;
-   HYPRE_Int                Ai;
-   HYPRE_Int                xi;
    hypre_BoxArray          *compute_box_a;
    hypre_Box               *compute_box;
                           
@@ -338,7 +334,6 @@ HYPRE_Int hypre_StructMatvecCC0( HYPRE_Complex       alpha,
    HYPRE_Int                depth;
    hypre_Index              loop_size;
    hypre_IndexRef           start;
-   HYPRE_Int                yi;
    HYPRE_Int                ndim;
 
    stencil       = hypre_StructMatrixStencil(A);
@@ -399,7 +394,7 @@ HYPRE_Int hypre_StructMatvecCC0( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -442,7 +437,7 @@ HYPRE_Int hypre_StructMatvecCC0( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -481,7 +476,7 @@ HYPRE_Int hypre_StructMatvecCC0( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -516,7 +511,7 @@ HYPRE_Int hypre_StructMatvecCC0( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -547,7 +542,7 @@ HYPRE_Int hypre_StructMatvecCC0( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -574,7 +569,7 @@ HYPRE_Int hypre_StructMatvecCC0( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -597,7 +592,7 @@ HYPRE_Int hypre_StructMatvecCC0( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop3For(Ai, xi, yi)
                   {
@@ -666,7 +661,7 @@ HYPRE_Int hypre_StructMatvecCC1( HYPRE_Complex       alpha,
    HYPRE_Int                xoff5;
    HYPRE_Int                xoff6;
    HYPRE_Int                Ai;
-   HYPRE_Int                xi;
+
    hypre_BoxArray          *compute_box_a;
    hypre_Box               *compute_box;
                           
@@ -681,7 +676,6 @@ HYPRE_Int hypre_StructMatvecCC1( HYPRE_Complex       alpha,
    HYPRE_Int                depth;
    hypre_Index              loop_size;
    hypre_IndexRef           start;
-   HYPRE_Int                yi;
    HYPRE_Int                ndim;
 
    stencil       = hypre_StructMatrixStencil(A);
@@ -749,7 +743,7 @@ HYPRE_Int hypre_StructMatvecCC1( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -796,7 +790,7 @@ HYPRE_Int hypre_StructMatvecCC1( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -838,7 +832,7 @@ HYPRE_Int hypre_StructMatvecCC1( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -875,7 +869,7 @@ HYPRE_Int hypre_StructMatvecCC1( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -907,7 +901,7 @@ HYPRE_Int hypre_StructMatvecCC1( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -934,7 +928,7 @@ HYPRE_Int hypre_StructMatvecCC1( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -956,7 +950,7 @@ HYPRE_Int hypre_StructMatvecCC1( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1010,8 +1004,7 @@ HYPRE_Int hypre_StructMatvecCC2( HYPRE_Complex       alpha,
    HYPRE_Int                xoff6;
    HYPRE_Int                si_center, center_rank;
    hypre_Index              center_index;
-   HYPRE_Int                Ai, Ai_CC;
-   HYPRE_Int                xi;
+   HYPRE_Int                Ai_CC;
    hypre_BoxArray          *compute_box_a;
    hypre_Box               *compute_box;
                           
@@ -1027,7 +1020,6 @@ HYPRE_Int hypre_StructMatvecCC2( HYPRE_Complex       alpha,
    HYPRE_Int                depth;
    hypre_Index              loop_size;
    hypre_IndexRef           start;
-   HYPRE_Int                yi;
    HYPRE_Int                ndim;
 
    stencil       = hypre_StructMatrixStencil(A);
@@ -1120,7 +1112,7 @@ HYPRE_Int hypre_StructMatvecCC2( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1180,7 +1172,7 @@ HYPRE_Int hypre_StructMatvecCC2( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1233,7 +1225,7 @@ HYPRE_Int hypre_StructMatvecCC2( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1280,7 +1272,7 @@ HYPRE_Int hypre_StructMatvecCC2( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1321,7 +1313,7 @@ HYPRE_Int hypre_StructMatvecCC2( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1356,7 +1348,7 @@ HYPRE_Int hypre_StructMatvecCC2( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1382,7 +1374,7 @@ HYPRE_Int hypre_StructMatvecCC2( HYPRE_Complex       alpha,
                                       x_data_box, start, stride, xi,
                                       y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
                   hypre_BoxLoop2For(xi, yi)
                   {
@@ -1405,7 +1397,7 @@ HYPRE_Int hypre_StructMatvecCC2( HYPRE_Complex       alpha,
                                 x_data_box, start, stride, xi,
                                 y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, yi)
             {
@@ -1421,7 +1413,7 @@ HYPRE_Int hypre_StructMatvecCC2( HYPRE_Complex       alpha,
                                 x_data_box, start, stride, xi,
                                 y_data_box, start, stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi,xi,Ai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop3For(Ai, xi, yi)
             {
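[Editor's note on the pragma hunks above: every change in struct_matvec.c drops the loop
indices (xi, yi, Ai) from the OpenMP private() clause while the hypre_BoxLoop*For lines
stay untouched. That is consistent with the reworked box-loop macros declaring their
indices inside the macro expansion, where they are private by construction, so only
HYPRE_BOX_PRIVATE remains to be named. A minimal stand-alone sketch of that OpenMP rule
— illustrative only, not the actual hypre macros:

    /* sketch.c - an index declared in the for-init of the parallel loop is
     * predetermined private in OpenMP; it is not even in scope at the pragma,
     * so it cannot (and need not) appear in a private() clause. */
    #include <stdio.h>

    #define N 8

    int main(void)
    {
       double y[N];
    #ifdef _OPENMP
    #pragma omp parallel for
    #endif
       for (int yi = 0; yi < N; yi++)  /* yi is loop-local: implicitly private */
       {
          y[yi] = 2.0 * yi;
       }
       for (int i = 0; i < N; i++)
          printf("y[%d] = %g\n", i, y[i]);
       return 0;
    }
]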
diff --git a/src/struct_mv/struct_scale.c b/src/struct_mv/struct_scale.c
index 3cb3627..2a03e2b 100644
--- a/src/struct_mv/struct_scale.c
+++ b/src/struct_mv/struct_scale.c
@@ -28,7 +28,6 @@ hypre_StructScale( HYPRE_Complex       alpha,
 {
    hypre_Box       *y_data_box;
                    
-   HYPRE_Int        yi;
    HYPRE_Complex   *yp;
                    
    hypre_BoxArray  *boxes;
@@ -55,7 +54,7 @@ hypre_StructScale( HYPRE_Complex       alpha,
       hypre_BoxLoop1Begin(hypre_StructVectorNDim(y), loop_size,
                           y_data_box, start, unit_stride, yi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,yi) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(yi)
       {
diff --git a/src/struct_mv/struct_vector.c b/src/struct_mv/struct_vector.c
index 7122be1..38d513c 100644
--- a/src/struct_mv/struct_vector.c
+++ b/src/struct_mv/struct_vector.c
@@ -70,7 +70,7 @@ hypre_StructVectorDestroy( hypre_StructVector *vector )
       {
          if (hypre_StructVectorDataAlloced(vector))
          {
-            hypre_SharedTFree(hypre_StructVectorData(vector));
+            hypre_DeviceTFree(hypre_StructVectorData(vector));
          }
          hypre_TFree(hypre_StructVectorDataIndices(vector));
          hypre_BoxArrayDestroy(hypre_StructVectorDataSpace(vector));
@@ -186,7 +186,8 @@ hypre_StructVectorInitialize( hypre_StructVector *vector )
 
    hypre_StructVectorInitializeShell(vector);
 
-   data = hypre_SharedCTAlloc(HYPRE_Complex, hypre_StructVectorDataSize(vector));
+   data = hypre_DeviceCTAlloc(HYPRE_Complex, hypre_StructVectorDataSize(vector));
+
    hypre_StructVectorInitializeData(vector, data);
    hypre_StructVectorDataAlloced(vector) = 1;
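[Editor's note: the Destroy and Initialize hunks above move the vector data off the
hypre_Shared* allocator — hypre_StructVectorInitialize now allocates with
hypre_DeviceCTAlloc and hypre_StructVectorDestroy frees with hypre_DeviceTFree, keeping
the pair matched. A hedged model of that discipline, with the Device macros mocked by
calloc/free purely for illustration (the real macros may target accelerator memory):

    #include <stdlib.h>

    /* stand-ins for hypre_DeviceCTAlloc / hypre_DeviceTFree, illustration only */
    #define DeviceCTAlloc(type, count) ((type *) calloc((count), sizeof(type)))
    #define DeviceTFree(ptr)           free(ptr)

    int main(void)
    {
       /* zero-initialized, like the CTAlloc family */
       double *data = DeviceCTAlloc(double, 100);
       /* ... fill and use data ... */
       DeviceTFree(data);  /* release through the matching macro so the
                              backend can change without touching callers */
       return 0;
    }
]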
 
@@ -294,13 +295,11 @@ hypre_StructVectorSetBoxValues( hypre_StructVector *vector,
    hypre_Box          *data_box;
    hypre_IndexRef      data_start;
    hypre_Index         data_stride;
-   HYPRE_Int           datai;
    HYPRE_Complex      *datap;
 
    hypre_Box          *dval_box;
    hypre_Index         dval_start;
    hypre_Index         dval_stride;
-   HYPRE_Int           dvali;
 
    hypre_Index         loop_size;
 
@@ -364,7 +363,7 @@ hypre_StructVectorSetBoxValues( hypre_StructVector *vector,
                                 data_box,data_start,data_stride,datai,
                                 dval_box,dval_start,dval_stride,dvali);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai,dvali) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop2For(datai, dvali)
             {
@@ -480,7 +479,6 @@ hypre_StructVectorClearBoxValues( hypre_StructVector *vector,
    hypre_Box          *data_box;
    hypre_IndexRef      data_start;
    hypre_Index         data_stride;
-   HYPRE_Int           datai;
    HYPRE_Complex      *datap;
 
    hypre_Index         loop_size;
@@ -539,7 +537,7 @@ hypre_StructVectorClearBoxValues( hypre_StructVector *vector,
          hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                              data_box,data_start,data_stride,datai);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(datai)
          {
@@ -562,15 +560,26 @@ hypre_StructVectorClearAllValues( hypre_StructVector *vector )
 {
    HYPRE_Complex *data      = hypre_StructVectorData(vector);
    HYPRE_Int      data_size = hypre_StructVectorDataSize(vector);
-   HYPRE_Int      i;
+   hypre_Index    imin, imax;
+   hypre_Box     *box;
+
+   box = hypre_BoxCreate(1);
+   hypre_IndexD(imin, 0) = 1;
+   hypre_IndexD(imax, 0) = data_size;
+   hypre_BoxSetExtents(box, imin, imax);
 
+   hypre_BoxLoop1Begin(1, imax,
+                       box, imin, imin, datai);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-   for (i = 0; i < data_size; i++)
+   hypre_BoxLoop1For(datai)
    {
-      data[i] = 0.0;
+      data[datai] = 0.0;
    }
+   hypre_BoxLoop1End(datai);
+   
+   hypre_BoxDestroy(box);
 
    return hypre_error_flag;
 }
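[Editor's note: hypre_StructVectorClearAllValues trades its raw for loop for a 1-D box
loop over the extents [1, data_size], so the zeroing goes through the same loop
abstraction (and its OpenMP/device backends) as every other kernel. A self-contained
model of the rewritten loop, with illustrative names:

    #include <stdio.h>

    /* model of the new ClearAllValues: walk a 1-D "box" [imin, imax] */
    static void clear_all(double *data, int data_size)
    {
       int imin = 1, imax = data_size;        /* box extents, as in the hunk */
       for (int datai = 0; datai <= imax - imin; datai++)
       {
          data[datai] = 0.0;
       }
    }

    int main(void)
    {
       double v[4] = {1.0, 2.0, 3.0, 4.0};
       clear_all(v, 4);
       printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);
       return 0;
    }
]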
@@ -616,7 +625,6 @@ hypre_StructVectorCopy( hypre_StructVector *x,
 {
    hypre_Box          *x_data_box;
                     
-   HYPRE_Int           vi;
    HYPRE_Complex      *xp, *yp;
 
    hypre_BoxArray     *boxes;
@@ -649,7 +657,7 @@ hypre_StructVectorCopy( hypre_StructVector *x,
       hypre_BoxLoop1Begin(hypre_StructVectorNDim(x), loop_size,
                           x_data_box, start, unit_stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(vi)
       {
@@ -670,7 +678,6 @@ hypre_StructVectorSetConstantValues( hypre_StructVector *vector,
 {
    hypre_Box          *v_data_box;
                     
-   HYPRE_Int           vi;
    HYPRE_Complex      *vp;
 
    hypre_BoxArray     *boxes;
@@ -702,7 +709,7 @@ hypre_StructVectorSetConstantValues( hypre_StructVector *vector,
       hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                           v_data_box, start, unit_stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(vi)
       {
@@ -727,7 +734,6 @@ hypre_StructVectorSetFunctionValues( hypre_StructVector *vector,
 {
    hypre_Box          *v_data_box;
                     
-   HYPRE_Int           vi;
    HYPRE_Complex      *vp;
 
    hypre_BoxArray     *boxes;
@@ -756,27 +762,28 @@ hypre_StructVectorSetFunctionValues( hypre_StructVector *vector,
  
       hypre_BoxGetSize(box, loop_size);
 
-      hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
-                          v_data_box, start, unit_stride, vi);
       i = hypre_IndexD(start, 0);
       j = hypre_IndexD(start, 1);
       k = hypre_IndexD(start, 2);
+      
+      hypre_SerialBoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
+				v_data_box, start, unit_stride, vi);
+
 /* RDF: This won't work as written with threading on */
+       
 #if 0
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE ) HYPRE_SMP_SCHEDULE
 #endif
 #else
-      hypre_BoxLoopSetOneBlock();
 #endif
-      hypre_BoxLoop1For(vi)
       {
          vp[vi] = fcn(i, j, k);
          i++;
          j++;
          k++;
       }
-      hypre_BoxLoop1End(vi);
+      hypre_SerialBoxLoop1End(vi);
    }
 
    return hypre_error_flag;
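[Editor's note: hypre_StructVectorSetFunctionValues switches to
hypre_SerialBoxLoop1Begin/End because, as the RDF comment says, the loop cannot be
threaded as written: i, j and k are incremented once per iteration, so each iteration
depends on all previous ones. A sketch of that loop-carried dependence (illustrative,
not hypre code):

    #include <stdio.h>

    static double fcn(int i, int j, int k) { return i + 10.0 * j + 100.0 * k; }

    int main(void)
    {
       double vp[5];
       int i = 0, j = 0, k = 0;        /* starting indices, as read from start */
       for (int vi = 0; vi < 5; vi++)  /* must run in order: i, j, k are carried */
       {
          vp[vi] = fcn(i, j, k);
          i++; j++; k++;
       }
       for (int n = 0; n < 5; n++)
          printf("vp[%d] = %g\n", n, vp[n]);
       return 0;
    }
]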
@@ -791,7 +798,6 @@ hypre_StructVectorClearGhostValues( hypre_StructVector *vector )
    HYPRE_Int           ndim = hypre_StructVectorNDim(vector);
    hypre_Box          *v_data_box;
                     
-   HYPRE_Int           vi;
    HYPRE_Complex      *vp;
 
    hypre_BoxArray     *boxes;
@@ -830,7 +836,7 @@ hypre_StructVectorClearGhostValues( hypre_StructVector *vector )
          hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                              v_data_box, start, unit_stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
          hypre_BoxLoop1For(vi)
          {
@@ -853,7 +859,6 @@ hypre_StructVectorClearBoundGhostValues( hypre_StructVector *vector,
                                          HYPRE_Int           force )
 {
    HYPRE_Int           ndim = hypre_StructVectorNDim(vector);
-   HYPRE_Int           vi;
    HYPRE_Complex      *vp;
    hypre_BoxArray     *boxes;
    hypre_Box          *box;
@@ -905,7 +910,7 @@ hypre_StructVectorClearBoundGhostValues( hypre_StructVector *vector,
             hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                                 v_data_box, start, stride, vi);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,vi ) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE ) HYPRE_SMP_SCHEDULE
 #endif
             hypre_BoxLoop1For(vi)
             {
@@ -930,7 +935,6 @@ hypre_StructVectorClearBoundGhostValues( hypre_StructVector *vector,
 HYPRE_Int 
 hypre_StructVectorScaleValues( hypre_StructVector *vector, HYPRE_Complex factor )
 {
-   HYPRE_Int         datai;
    HYPRE_Complex    *data;
 
    hypre_Index       imin;
@@ -953,7 +957,7 @@ hypre_StructVectorScaleValues( hypre_StructVector *vector, HYPRE_Complex factor
    hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size,
                        box, imin, imin, datai);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
    hypre_BoxLoop1For(datai)
    {
@@ -1180,7 +1184,6 @@ hypre_StructVectorMaxValue( hypre_StructVector *vector,
    A hypre_Index corresponding to max_index is put in max_xyz_index.
    We assume that there is only one box to deal with. */
 {
-   HYPRE_Int         datai;
    HYPRE_Real       *data;
 
    hypre_Index       imin;
@@ -1211,16 +1214,16 @@ hypre_StructVectorMaxValue( hypre_StructVector *vector,
       data = hypre_StructVectorBoxData(vector, i);
       hypre_BoxGetSize(box, loop_size);
       hypre_CopyIndex( hypre_BoxIMin(box), imin );
-
-      hypre_BoxLoop1Begin(ndim, loop_size,
-                          box, imin, unit_stride, datai);
       maxindex = hypre_BoxIndexRank( box, imin );
       maxvalue = data[maxindex];
       hypre_SetIndex(max_xyz_index, 0);
+/*FIXME: must run sequentially*/
+      zypre_BoxLoop1Begin(ndim, loop_size,
+                          box, imin, unit_stride, datai);      
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,datai) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
-      hypre_BoxLoop1For(datai)
+      zypre_BoxLoop1For(datai)
       {
          if ( data[datai] > maxvalue )
          {
@@ -1229,7 +1232,7 @@ hypre_StructVectorMaxValue( hypre_StructVector *vector,
             hypre_BoxLoopGetIndex(max_xyz_index);
          }
       }
-      hypre_BoxLoop1End(datai);
+      zypre_BoxLoop1End(datai);
       hypre_AddIndexes(max_xyz_index, imin, ndim, max_xyz_index);
    }
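[Editor's note: hypre_StructVectorMaxValue likewise moves to the sequential
zypre_BoxLoop1 variant (see the FIXME): an argmax reduction updates the shared
maxvalue/maxindex pair, which would race under the parallel loop. The serial reduction
it performs, in minimal form:

    #include <stdio.h>

    int main(void)
    {
       double data[] = {3.0, 7.0, 5.0, 7.5, 1.0};
       int n = 5, maxindex = 0;
       double maxvalue = data[0];
       for (int datai = 1; datai < n; datai++)  /* sequential on purpose */
       {
          if (data[datai] > maxvalue)
          {
             maxvalue = data[datai];
             maxindex = datai;
          }
       }
       printf("max %g at index %d\n", maxvalue, maxindex);
       return 0;
    }
]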
 
@@ -1245,7 +1248,7 @@ hypre_StructVectorMaxValue( hypre_StructVector *vector,
  *--------------------------------------------------------------------------*/
 hypre_StructVector *
 hypre_StructVectorClone(
-	hypre_StructVector *x)
+   hypre_StructVector *x)
 {
    MPI_Comm		comm = hypre_StructVectorComm(x);
    hypre_StructGrid    *grid = hypre_StructVectorGrid(x);
@@ -1259,11 +1262,12 @@ hypre_StructVectorClone(
 
    hypre_StructVectorDataSize(y) = data_size;
    hypre_StructVectorDataSpace(y) = hypre_BoxArrayDuplicate(data_space);
-   hypre_StructVectorData(y) = hypre_CTAlloc(HYPRE_Complex,data_size);
+   hypre_StructVectorData(y) = hypre_DeviceCTAlloc(HYPRE_Complex, data_size);
+   
    hypre_StructVectorDataIndices(y) = hypre_CTAlloc(HYPRE_Int, data_space_size);
 
    for (i=0; i < data_space_size; i++)
-       hypre_StructVectorDataIndices(y)[i] = data_indices[i];
+      hypre_StructVectorDataIndices(y)[i] = data_indices[i];
 
    hypre_StructVectorCopy( x, y );
 
diff --git a/src/test/Makefile b/src/test/Makefile
index 9ecd6ab..17e8f46 100644
--- a/src/test/Makefile
+++ b/src/test/Makefile
@@ -10,10 +10,12 @@
 # $Revision$
 #EHEADER**********************************************************************
 
+default:all
 
 include ../config/Makefile.config
 
 CINCLUDES = ${INCLUDES} ${MPIINCLUDE}
+
 CDEFS = -DHYPRE_TIMING -DHYPRE_FORTRAN
 CXXDEFS = -DNOFEI -DHYPRE_TIMING -DMPICH_SKIP_MPICXX
 
@@ -162,6 +164,10 @@ zboxloop: zboxloop.o
 	@echo  "Building" $@ "... "
 	${LINK_CC} -o $@ $@.o ${LFLAGS}
 
+struct_newboxloop: struct_newboxloop.o $(KOKKOS_LINK_DEPENDS)
+	@echo  "Building" $@ "... "
+	${LINK_CC} -o $@ $@.o ${LFLAGS}
+
 # RDF: Keep these for now
 
 hypre_set_precond: hypre_set_precond.o
diff --git a/src/test/TEST_examples/complex.jobs b/src/test/TEST_examples/complex.jobs
index 27aeec0..fe76f16 100755
--- a/src/test/TEST_examples/complex.jobs
+++ b/src/test/TEST_examples/complex.jobs
@@ -15,5 +15,5 @@
 # Run complex examples
 #=============================================================================
 
-mpirun -np 16 ex18comp -n 4 > complex.out.1
+mpirun -np 16 ./ex18comp -n 4 > complex.out.1
 
diff --git a/src/test/TEST_examples/maxdim.jobs b/src/test/TEST_examples/maxdim.jobs
index f349ac0..cac0404 100755
--- a/src/test/TEST_examples/maxdim.jobs
+++ b/src/test/TEST_examples/maxdim.jobs
@@ -15,6 +15,6 @@
 # Run maxdim examples
 #=============================================================================
 
-mpirun -np 16 ex17 -n 10 > maxdim.out.1
+mpirun -np 16 ./ex17 -n 10 > maxdim.out.1
 
-mpirun -np 16 ex18 -n 4 > maxdim.out.2
+mpirun -np 16 ./ex18 -n 4 > maxdim.out.2
diff --git a/src/test/TEST_ij/smoother.jobs b/src/test/TEST_ij/smoother.jobs
index bd85133..7876dff 100755
--- a/src/test/TEST_ij/smoother.jobs
+++ b/src/test/TEST_ij/smoother.jobs
@@ -36,6 +36,7 @@
 #   14: Polynomial (Chebyshev 3rd order) on 4 procs
 #   15: FCF Jacobi on 4 procs
 #   16: CG smoother on 4 procs
+#   17-21: Polynomial (Chebyshev 2nd order) with various options
 #=============================================================================
 
 mpirun -np 3 ./ij -rhsrand -n 15 30 10 -w 1.1 -owl 1.0 0 \
@@ -87,3 +88,21 @@ mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 17 -n 20 20 10 -P 2 2 1 \
 mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 15 -n 20 20 10 -P 2 2 1 \
 > smoother.out.15
 
+mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 16 -cheby_scale 0 -n 20 20 20 \
+-P 2 2 1 -27pt > smoother.out.16
+
+mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 16 -cheby_variant 1 -n 20 20 20 \
+-P 2 2 1 > smoother.out.17
+
+mpirun -np 4 ./ij -solver 3 -rlx 16 -cheby_eig_est 0 -n 40 40 20 \
+-P 2 2 1 -difconv -a 10 10 10  > smoother.out.18
+
+mpirun -np 4 ./ij -rhsrand -solver 1 -rlx 16 -rotate -alpha 60 -eps 0.1 -cheby_fraction 0.2 -n 200 200 \
+-P 2 2  > smoother.out.19
+
+mpirun -np 4 ./ij -solver 1 -rlx 16 -cheby_eig_est 5 -n 40 40 20 \
+-P 2 2 1 -vardifconv -eps 0.1  > smoother.out.20
+
+
+
+
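[Editor's note: the five new runs (smoother.out.16 through .20) all exercise hypre's
Chebyshev smoother (-rlx 16) with its scaling, variant, eigenvalue-estimate and fraction
options. As background, a hedged sketch of the textbook Chebyshev recurrence such a
smoother is built on — not hypre's implementation — applied to a diagonal SPD system
with known spectrum bounds [lmin, lmax]:

    #include <stdio.h>

    #define N 4

    int main(void)
    {
       double A[N] = {1.0, 2.0, 3.0, 4.0};  /* diag(A); spectrum in [1, 4] */
       double b[N] = {1.0, 1.0, 1.0, 1.0};
       double x[N] = {0.0}, r[N], d[N];
       double lmin = 1.0, lmax = 4.0;
       double theta = 0.5 * (lmax + lmin), delta = 0.5 * (lmax - lmin);
       double sigma = theta / delta, rho = 1.0 / sigma, rho_new;
       int i, k;

       for (i = 0; i < N; i++) { r[i] = b[i] - A[i] * x[i]; d[i] = r[i] / theta; }
       for (k = 0; k < 3; k++)              /* a few smoothing sweeps */
       {
          for (i = 0; i < N; i++) { x[i] += d[i]; r[i] -= A[i] * d[i]; }
          rho_new = 1.0 / (2.0 * sigma - rho);
          for (i = 0; i < N; i++)
             d[i] = rho_new * rho * d[i] + (2.0 * rho_new / delta) * r[i];
          rho = rho_new;
       }
       for (i = 0; i < N; i++) printf("x[%d] = %g\n", i, x[i]);
       return 0;
    }
]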
diff --git a/src/test/TEST_ij/smoother.saved b/src/test/TEST_ij/smoother.saved
index 914e63b..4564092 100644
--- a/src/test/TEST_ij/smoother.saved
+++ b/src/test/TEST_ij/smoother.saved
@@ -74,3 +74,23 @@ Final Relative Residual Norm = 5.044385e-10
 Iterations = 15
 Final Relative Residual Norm = 5.807749e-09
 
+# Output file: smoother.out.16
+Iterations = 6
+Final Relative Residual Norm = 1.555966e-09
+
+# Output file: smoother.out.17
+Iterations = 7
+Final Relative Residual Norm = 2.088732e-09
+
+# Output file: smoother.out.18
+GMRES Iterations = 11
+Final GMRES Relative Residual Norm = 8.192864e-09
+
+# Output file: smoother.out.19
+Iterations = 6
+Final Relative Residual Norm = 8.887087e-10
+
+# Output file: smoother.out.20
+Iterations = 11
+Final Relative Residual Norm = 3.089502e-09
+
diff --git a/src/test/TEST_ij/smoother.sh b/src/test/TEST_ij/smoother.sh
index 35b3fe5..a9a9542 100755
--- a/src/test/TEST_ij/smoother.sh
+++ b/src/test/TEST_ij/smoother.sh
@@ -43,6 +43,11 @@ FILES="\
  ${TNAME}.out.13\
  ${TNAME}.out.14\
  ${TNAME}.out.15\
+ ${TNAME}.out.16\
+ ${TNAME}.out.17\
+ ${TNAME}.out.18\
+ ${TNAME}.out.19\
+ ${TNAME}.out.20\
 "
 
 for i in $FILES
diff --git a/src/test/TEST_ij/solvers.jobs b/src/test/TEST_ij/solvers.jobs
index ac2aee8..ae37d4c 100755
--- a/src/test/TEST_ij/solvers.jobs
+++ b/src/test/TEST_ij/solvers.jobs
@@ -82,6 +82,9 @@ mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add
 mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 > solvers.out.111
 mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 > solvers.out.112
 mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 > solvers.out.113
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -add_end 2 > solvers.out.118
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -ns 2 > solvers.out.119
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -rlx 18 -ns 2 -rlx_coarse 18 -ns_coarse 2 > solvers.out.120
 
 #nonGalerkin version
 mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114
diff --git a/src/test/TEST_ij/solvers.saved b/src/test/TEST_ij/solvers.saved
index 192947e..a3ef503 100644
--- a/src/test/TEST_ij/solvers.saved
+++ b/src/test/TEST_ij/solvers.saved
@@ -143,3 +143,15 @@ Final GMRES Relative Residual Norm = 9.500151e-09
 GMRES Iterations = 10
 Final GMRES Relative Residual Norm = 1.006494e-09
 
+# Output file: solvers.out.118
+GMRES Iterations = 25
+Final GMRES Relative Residual Norm = 9.464475e-09
+
+# Output file: solvers.out.119
+GMRES Iterations = 23
+Final GMRES Relative Residual Norm = 9.269998e-09
+
+# Output file: solvers.out.120
+GMRES Iterations = 17
+Final GMRES Relative Residual Norm = 3.995718e-09
+
diff --git a/src/test/TEST_ij/solvers.sh b/src/test/TEST_ij/solvers.sh
index c9ac7ec..963e4db 100755
--- a/src/test/TEST_ij/solvers.sh
+++ b/src/test/TEST_ij/solvers.sh
@@ -96,6 +96,9 @@ FILES="\
  ${TNAME}.out.115\
  ${TNAME}.out.116\
  ${TNAME}.out.117\
+ ${TNAME}.out.118\
+ ${TNAME}.out.119\
+ ${TNAME}.out.120\
 "
 
 for i in $FILES
diff --git a/src/test/TEST_ij/solvers.jobs b/src/test/TEST_longdouble/solvers_ij.jobs
similarity index 50%
copy from src/test/TEST_ij/solvers.jobs
copy to src/test/TEST_longdouble/solvers_ij.jobs
index ac2aee8..5bb5b4c 100755
--- a/src/test/TEST_ij/solvers.jobs
+++ b/src/test/TEST_longdouble/solvers_ij.jobs
@@ -44,49 +44,49 @@
 #
 #=============================================================================
 
-mpirun -np 2 ./ij -solver 1 -rhsrand > solvers.out.0
-mpirun -np 2 ./ij -solver 2 -rhsrand > solvers.out.1
-mpirun -np 2 ./ij -solver 3 -rhsrand > solvers.out.2
-mpirun -np 2 ./ij -solver 4 -rhsrand > solvers.out.3
-mpirun -np 2 ./ij -solver 5 -rhsrand -w 0.67 -ns 2 > solvers.out.4
-mpirun -np 2 ./ij -solver 6 -rhsrand > solvers.out.5
-mpirun -np 2 ./ij -solver 7 -rhsrand > solvers.out.6
-mpirun -np 2 ./ij -solver 8 -rhsrand > solvers.out.7
-mpirun -np 2 ./ij -solver 20 -rhsrand > solvers.out.8
-mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand > solvers.out.9
-mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 2 > solvers.out.10
-mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 3 > solvers.out.11
+mpirun -np 2 ./ij -solver 1 -tol 1.e-16 -rhsrand > solvers_ij.out.0
+mpirun -np 2 ./ij -solver 2 -tol 1.e-16 -rhsrand > solvers_ij.out.1
+mpirun -np 2 ./ij -solver 3 -tol 1.e-16 -rhsrand > solvers_ij.out.2
+mpirun -np 2 ./ij -solver 4 -tol 1.e-16 -rhsrand > solvers_ij.out.3
+mpirun -np 2 ./ij -solver 5 -tol 1.e-16 -rhsrand -w 0.67 -ns 2 > solvers_ij.out.4
+mpirun -np 2 ./ij -solver 6 -tol 1.e-16 -rhsrand > solvers_ij.out.5
+mpirun -np 2 ./ij -solver 7 -tol 1.e-16 -rhsrand > solvers_ij.out.6
+mpirun -np 2 ./ij -solver 8 -tol 1.e-16 -rhsrand > solvers_ij.out.7
+mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -rhsrand > solvers_ij.out.8
+mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -cf 0.5 -rhsrand > solvers_ij.out.9
+mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -cf 0.5 -rhsrand -solver_type 2 > solvers_ij.out.10
+mpirun -np 2 ./ij -solver 20 -tol 1.e-16 -cf 0.5 -rhsrand -solver_type 3 > solvers_ij.out.11
 
 #systems AMG run ...unknown approach, hybrid approach, nodal approach
-mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 > solvers.out.sysu
-mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 > solvers.out.sysh
-mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 > solvers.out.sysn
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -tol 1.e-16 > solvers_ij.out.sysu
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 -tol 1.e-16 > solvers_ij.out.sysh
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 -tol 1.e-16 > solvers_ij.out.sysn
 
 #LGMRS and FlexGMRES
-mpirun -np 2 ./ij -solver 50 -rhsrand > solvers.out.101
-mpirun -np 2 ./ij -solver 51 -rhsrand > solvers.out.102
-mpirun -np 2 ./ij -solver 60 -rhsrand > solvers.out.103
-mpirun -np 2 ./ij -solver 61 -rhsrand > solvers.out.104
+mpirun -np 2 ./ij -solver 50 -tol 1.e-16 -rhsrand > solvers_ij.out.101
+mpirun -np 2 ./ij -solver 51 -tol 1.e-16 -rhsrand > solvers_ij.out.102
+mpirun -np 2 ./ij -solver 60 -tol 1.e-16 -rhsrand > solvers_ij.out.103
+mpirun -np 2 ./ij -solver 61 -tol 1.e-16 -rhsrand > solvers_ij.out.104
 
 #agglomerated coarse grid solve
-mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 > solvers.out.105
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 > solvers.out.107
+mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -tol 1.e-16 > solvers_ij.out.105
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -tol 1.e-16 > solvers_ij.out.107
 
 #redundant coarse grid solve
-mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 > solvers.out.106
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 > solvers.out.108
+mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 -tol 1.e-16 > solvers_ij.out.106
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 -tol 1.e-16 > solvers_ij.out.108
 
 #additive cycles
-mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 > solvers.out.109
-mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 > solvers.out.110
-mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 > solvers.out.111
-mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 > solvers.out.112
-mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 > solvers.out.113
+mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 -tol 1.e-16 > solvers_ij.out.109
+mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 -tol 1.e-16 > solvers_ij.out.110
+mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 -tol 1.e-16 > solvers_ij.out.111
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 -tol 1.e-16 > solvers_ij.out.112
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -tol 1.e-16 > solvers_ij.out.113
 
 #nonGalerkin version
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 > solvers.out.115
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 -tol 1.e-16 > solvers_ij.out.114
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 -tol 1.e-16 > solvers_ij.out.115
 
 #RAP options
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 > solvers.out.116
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 -tol 1.e-16 > solvers_ij.out.116
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 -tol 1.e-16 > solvers_ij.out.117
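[Editor's note: the TEST_longdouble copies of the ij jobs add -tol 1.e-16 throughout,
while the TEST_single copies further below relax it to 1.e-4 — the attainable relative
residual tracks the machine epsilon of the precision hypre was built in. A one-liner to
see those epsilons:

    #include <float.h>
    #include <stdio.h>

    int main(void)
    {
       printf("float eps       = %e\n",  FLT_EPSILON);   /* ~1.2e-07 */
       printf("double eps      = %e\n",  DBL_EPSILON);   /* ~2.2e-16 */
       printf("long double eps = %Le\n", LDBL_EPSILON);  /* ~1.1e-19 on x86 */
       return 0;
    }
]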
diff --git a/src/test/TEST_longdouble/solvers_ij.saved b/src/test/TEST_longdouble/solvers_ij.saved
new file mode 100644
index 0000000..54fa979
--- /dev/null
+++ b/src/test/TEST_longdouble/solvers_ij.saved
@@ -0,0 +1,145 @@
+# Output file: solvers_ij.out.0
+Iterations = 14
+Final Relative Residual Norm = 7.837531e-18
+
+# Output file: solvers_ij.out.1
+Iterations = 65
+Final Relative Residual Norm = 5.127733e-17
+
+# Output file: solvers_ij.out.2
+GMRES Iterations = 14
+Final GMRES Relative Residual Norm = 2.081070e-17
+
+# Output file: solvers_ij.out.3
+GMRES Iterations = 199
+Final GMRES Relative Residual Norm = 9.121629e-17
+
+# Output file: solvers_ij.out.4
+Iterations = 18
+Final Relative Residual Norm = 2.567047e-17
+
+# Output file: solvers_ij.out.5
+Iterations = 375
+Final Relative Residual Norm = 9.100420e-17
+
+# Output file: solvers_ij.out.6
+GMRES Iterations = 65
+Final GMRES Relative Residual Norm = 9.781966e-17
+
+# Output file: solvers_ij.out.7
+Iterations = 42
+Final Relative Residual Norm = 4.868667e-17
+
+# Output file: solvers_ij.out.8
+Iterations = 65
+PCG_Iterations = 0
+DSCG_Iterations = 65
+Final Relative Residual Norm = 5.127733e-17
+
+# Output file: solvers_ij.out.9
+Iterations = 19
+PCG_Iterations = 15
+DSCG_Iterations = 4
+Final Relative Residual Norm = 4.300066e-17
+
+# Output file: solvers_ij.out.10
+Iterations = 19
+PCG_Iterations = 17
+DSCG_Iterations = 2
+Final Relative Residual Norm = 1.715691e-17
+
+# Output file: solvers_ij.out.11
+Iterations = 12
+PCG_Iterations = 9
+DSCG_Iterations = 3
+Final Relative Residual Norm = 5.036490e-18
+
+# Output file: solvers_ij.out.sysh
+ Average Convergence Factor = 0.122527
+
+     Complexity:    grid = 1.613750
+                operator = 2.860373
+                   cycle = 5.720578
+
+# Output file: solvers_ij.out.sysn
+ Average Convergence Factor = 0.222219
+
+     Complexity:    grid = 1.592000
+                operator = 2.633619
+                   cycle = 11.267164
+
+# Output file: solvers_ij.out.sysu
+ Average Convergence Factor = 0.430735
+
+     Complexity:    grid = 1.614937
+                operator = 2.866488
+                   cycle = 5.732598
+
+# Output file: solvers_ij.out.101
+LGMRES Iterations = 175
+Final LGMRES Relative Residual Norm = 9.264825e-17
+
+# Output file: solvers_ij.out.102
+LGMRES Iterations = 17
+Final LGMRES Relative Residual Norm = 2.721454e-17
+
+# Output file: solvers_ij.out.103
+FlexGMRES Iterations = 199
+Final FlexGMRES Relative Residual Norm = 9.121063e-17
+
+# Output file: solvers_ij.out.104
+FlexGMRES Iterations = 14
+Final FlexGMRES Relative Residual Norm = 2.081457e-17
+
+# Output file: solvers_ij.out.105
+Iterations = 27
+Final Relative Residual Norm = 4.195866e-17
+
+# Output file: solvers_ij.out.106
+Iterations = 27
+Final Relative Residual Norm = 4.195866e-17
+
+# Output file: solvers_ij.out.107
+Iterations = 41
+Final Relative Residual Norm = 5.542245e-17
+
+# Output file: solvers_ij.out.108
+Iterations = 41
+Final Relative Residual Norm = 5.542245e-17
+
+# Output file: solvers_ij.out.109
+Iterations = 32
+Final Relative Residual Norm = 8.814032e-17
+
+# Output file: solvers_ij.out.110
+Iterations = 32
+Final Relative Residual Norm = 8.814032e-17
+
+# Output file: solvers_ij.out.111
+Iterations = 56
+Final Relative Residual Norm = 8.514971e-17
+
+# Output file: solvers_ij.out.112
+GMRES Iterations = 43
+Final GMRES Relative Residual Norm = 8.418244e-17
+
+# Output file: solvers_ij.out.113
+GMRES Iterations = 50
+Final GMRES Relative Residual Norm = 7.078678e-17
+
+# Output file: solvers_ij.out.114
+BoomerAMG Iterations = 33
+Final Relative Residual Norm = 3.984250e-17
+
+# Output file: solvers_ij.out.115
+BoomerAMG Iterations = 33
+Final Relative Residual Norm = 8.096418e-17
+
+# Output file: solvers_ij.out.116
+GMRES Iterations = 18
+Final GMRES Relative Residual Norm = 3.003469e-17
+
+# Output file: solvers_ij.out.117
+GMRES Iterations = 18
+Final GMRES Relative Residual Norm = 3.024076e-17
+
diff --git a/src/test/TEST_ij/solvers.sh b/src/test/TEST_longdouble/solvers_ij.sh
similarity index 100%
copy from src/test/TEST_ij/solvers.sh
copy to src/test/TEST_longdouble/solvers_ij.sh
diff --git a/src/test/TEST_longdouble/solvers_struct.jobs b/src/test/TEST_longdouble/solvers_struct.jobs
new file mode 100644
index 0000000..ff57936
--- /dev/null
+++ b/src/test/TEST_longdouble/solvers_struct.jobs
@@ -0,0 +1,63 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+
+
+
+
+
+
+#=============================================================================
+# struct: Run SMG-CG, PFMG-CG, DSCG, CG, and Hybrid.
+#=============================================================================
+
+mpirun -np 3 ./struct -P 1 1 3 -solver 10 -tol 1.0e-16 > solvers_struct.out.0
+mpirun -np 3 ./struct -P 1 3 1 -solver 11 -tol 1.0e-16 > solvers_struct.out.1
+mpirun -np 3 ./struct -P 3 1 1 -solver 17 -tol 1.0e-16 > solvers_struct.out.2
+mpirun -np 1 ./struct -P 1 1 1 -solver 18 -tol 1.0e-16 > solvers_struct.out.3
+mpirun -np 1 ./struct -P 1 1 1 -solver 19 -tol 1.0e-16 > solvers_struct.out.4
+
+
+#=============================================================================
+# Run default case with all available PCG preconditioners (solvers): 
+#    10: SMG (default)
+#    11: PFMG
+#    17: 2-step Jacobi
+#    18: Diagonal scaling
+#    19: none
+#=============================================================================
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs
+mpirun -np 2 ./struct -solver 10 -tol 1.e-16 > solvers_struct.out.10.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-16 -pcgitr 0 -seed 1 -vrand 1 > solvers_struct.out.10.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-16 -pcgitr 0  -seed 1 -vrand 5  > solvers_struct.out.10.lobpcg.5
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs
+mpirun -np 2 ./struct -solver 11 -tol 1.e-16  > solvers_struct.out.11.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-16  -pcgitr 0 -seed 1 -vrand 1  > solvers_struct.out.11.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-16  -pcgitr 0 -seed 1 -vrand 5  > solvers_struct.out.11.lobpcg.5
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs
+mpirun -np 2 ./struct -solver 17 -tol 1.e-16 > solvers_struct.out.17.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.17.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 5  > solvers_struct.out.17.lobpcg.5
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs
+mpirun -np 2 ./struct -solver 18  -tol 1.e-16 > solvers_struct.out.18.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.18.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 5  > solvers_struct.out.18.lobpcg.5
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 5 eigenpairs
+mpirun -np 2 ./struct -solver 19 -tol 1.e-16 > solvers_struct.out.19.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.19.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-16  -pcgitr 10 -seed 1 -vrand 5  > solvers_struct.out.19.lobpcg.5
diff --git a/src/test/TEST_longdouble/solvers_struct.saved b/src/test/TEST_longdouble/solvers_struct.saved
new file mode 100644
index 0000000..e7848d9
--- /dev/null
+++ b/src/test/TEST_longdouble/solvers_struct.saved
@@ -0,0 +1,120 @@
+# Output file: solvers_struct.out.0
+Iterations = 10
+Final Relative Residual Norm = 6.581106e-18
+
+# Output file: solvers_struct.out.1
+Iterations = 21
+Final Relative Residual Norm = 1.596699e-17
+
+# Output file: solvers_struct.out.2
+Iterations = 38
+Final Relative Residual Norm = 7.177049e-17
+
+# Output file: solvers_struct.out.3
+Iterations = 34
+Final Relative Residual Norm = 3.265691e-18
+
+# Output file: solvers_struct.out.4
+Iterations = 34
+Final Relative Residual Norm = 3.219688e-18
+
+# Output file: solvers_struct.out.10.lobpcg
+Iterations = 9
+Final Relative Residual Norm = 8.224720e-17
+
+# Output file: solvers_struct.out.10.lobpcg.1
+Eigenvalue lambda   1.84366453091756e-01
+Residual   6.51777159501587e-08
+
+# Output file: solvers_struct.out.10.lobpcg.5
+Eigenvalue lambda   1.84366453091754e-01
+Eigenvalue lambda   2.50882493969729e-01
+Eigenvalue lambda   3.60090369737175e-01
+Eigenvalue lambda   4.20845334658399e-01
+Eigenvalue lambda   4.20845334658517e-01
+Residual   3.01898591717032e-08
+Residual   4.55422767070726e-08
+Residual   3.17971068902989e-07
+Residual   1.36013718828342e-07
+Residual   3.39522008905860e-07
+
+# Output file: solvers_struct.out.11.lobpcg
+Iterations = 20
+Final Relative Residual Norm = 4.957736e-17
+
+# Output file: solvers_struct.out.11.lobpcg.1
+Eigenvalue lambda   1.84366453091756e-01
+Residual   5.90808378148053e-08
+
+# Output file: solvers_struct.out.11.lobpcg.5
+Eigenvalue lambda   1.84366453091753e-01
+Eigenvalue lambda   2.50882493969730e-01
+Eigenvalue lambda   3.60090369737179e-01
+Eigenvalue lambda   4.20845334658427e-01
+Eigenvalue lambda   4.20845334658490e-01
+Residual   3.21720429477434e-08
+Residual   6.44224703485687e-08
+Residual   2.42812454193770e-07
+Residual   3.37502013163608e-07
+Residual   3.28980662738132e-07
+
+# Output file: solvers_struct.out.17.lobpcg
+Iterations = 34
+Final Relative Residual Norm = 4.087402e-17
+
+# Output file: solvers_struct.out.17.lobpcg.1
+Eigenvalue lambda   1.84366453091770e-01
+Residual   1.03034775754103e-07
+
+# Output file: solvers_struct.out.17.lobpcg.5
+Eigenvalue lambda   1.84366453091753e-01
+Eigenvalue lambda   2.50882493969730e-01
+Eigenvalue lambda   3.60090369737173e-01
+Eigenvalue lambda   4.20845334658410e-01
+Eigenvalue lambda   4.20845334658417e-01
+Residual   2.01876274956198e-08
+Residual   2.36598865463032e-07
+Residual   1.80211165362944e-07
+Residual   1.64192171824644e-07
+Residual   1.88239862086799e-07
+
+# Output file: solvers_struct.out.18.lobpcg
+Iterations = 59
+Final Relative Residual Norm = 5.038238e-17
+
+# Output file: solvers_struct.out.18.lobpcg.1
+Eigenvalue lambda   1.84366453091755e-01
+Residual   8.10727379995405e-08
+
+# Output file: solvers_struct.out.18.lobpcg.5
+Eigenvalue lambda   1.84366453091753e-01
+Eigenvalue lambda   2.50882493969763e-01
+Eigenvalue lambda   3.60090369737173e-01
+Eigenvalue lambda   4.20845334658457e-01
+Eigenvalue lambda   4.20845334658496e-01
+Residual   9.97341733929215e-08
+Residual   1.68228045654293e-07
+Residual   1.17106600441032e-07
+Residual   3.61327412593828e-07
+Residual   3.08969359498729e-07
+
+# Output file: solvers_struct.out.19.lobpcg
+Iterations = 32
+Final Relative Residual Norm = 8.259590e-07
+
+# Output file: solvers_struct.out.19.lobpcg.1
+Eigenvalue lambda   1.84366453091755e-01
+Residual   8.10727379999456e-08
+
+# Output file: solvers_struct.out.19.lobpcg.5
+Eigenvalue lambda   1.84366453091753e-01
+Eigenvalue lambda   2.50882493969682e-01
+Eigenvalue lambda   3.60090369737172e-01
+Eigenvalue lambda   4.20845334658445e-01
+Eigenvalue lambda   4.20845334658495e-01
+Residual   9.97341734185370e-08
+Residual   1.68228046457913e-07
+Residual   1.17106597195939e-07
+Residual   3.61311195302798e-07
+Residual   3.14060616265268e-07
+
diff --git a/src/test/TEST_ij/smoother.sh b/src/test/TEST_longdouble/solvers_struct.sh
similarity index 77%
copy from src/test/TEST_ij/smoother.sh
copy to src/test/TEST_longdouble/solvers_struct.sh
index 35b3fe5..d93c101 100755
--- a/src/test/TEST_ij/smoother.sh
+++ b/src/test/TEST_longdouble/solvers_struct.sh
@@ -11,8 +11,19 @@
 # $Revision$
 #EHEADER**********************************************************************
 
+
+
+
+
 TNAME=`basename $0 .sh`
+CONVTOL=$1
 
+# Set default check tolerance
+if [ x$CONVTOL = "x" ];
+then
+    CONVTOL=0.0
+fi
+#echo "tol = $CONVTOL"
 #=============================================================================
 # compare with baseline case
 #=============================================================================
@@ -22,37 +33,35 @@ FILES="\
  ${TNAME}.out.1\
  ${TNAME}.out.2\
  ${TNAME}.out.3\
+ ${TNAME}.out.4\
 "
 
 for i in $FILES
 do
   echo "# Output file: $i"
-  tail -21 $i | head -6
+  tail -3 $i
 done > ${TNAME}.out
 
 FILES="\
- ${TNAME}.out.4\
- ${TNAME}.out.5\
- ${TNAME}.out.6\
- ${TNAME}.out.7\
- ${TNAME}.out.8\
- ${TNAME}.out.9\
- ${TNAME}.out.10\
- ${TNAME}.out.11\
- ${TNAME}.out.12\
- ${TNAME}.out.13\
- ${TNAME}.out.14\
- ${TNAME}.out.15\
+ ${TNAME}.out.10.lobpcg\
+ ${TNAME}.out.11.lobpcg\
+ ${TNAME}.out.17.lobpcg\
+ ${TNAME}.out.18.lobpcg\
+ ${TNAME}.out.19.lobpcg\
 "
 
 for i in $FILES
 do
   echo "# Output file: $i"
   tail -3 $i
+  echo "# Output file: $i.1"
+  tail -13 $i.1 | head -3
+  echo "# Output file: $i.5"
+  tail -21 $i.5 | head -11
 done >> ${TNAME}.out
 
 # Make sure that the output files are reasonable
-CHECK_LINE="Complexity"
+CHECK_LINE="Iterations"
 OUT_COUNT=`grep "$CHECK_LINE" ${TNAME}.out | wc -l`
 SAVED_COUNT=`grep "$CHECK_LINE" ${TNAME}.saved | wc -l`
 if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then
@@ -60,7 +69,7 @@ if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then
 fi
 
 if [ -z $HYPRE_NO_SAVED ]; then
-   diff -U3 -bI"time" ${TNAME}.saved ${TNAME}.out >&2
+   (../runcheck.sh ${TNAME}.out ${TNAME}.saved $CONVTOL) >&2
 fi
 
 #=============================================================================
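[Editor's note: the check scripts stop diff-ing baselines verbatim and instead pass a
CONVTOL argument (default 0.0, i.e. exact match) to ../runcheck.sh. runcheck.sh itself
is not part of this patch; presumably it compares the numeric fields of .out against
.saved within that tolerance. A minimal relative-tolerance check of that kind, for
illustration only:

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    /* accept out vs. saved if they agree to relative tolerance tol;
     * tol = 0.0 degenerates to an exact comparison, matching the default */
    static int within_tol(double out, double saved, double tol)
    {
       double denom = fmax(fabs(saved), DBL_MIN);  /* avoid divide-by-zero */
       return fabs(out - saved) <= tol * denom;
    }

    int main(void)
    {
       /* e.g. residual norms extracted from the .out and .saved files */
       printf("%d\n", within_tol(8.887087e-10, 8.9e-10, 1.0e-2));  /* 1: within 1%% */
       printf("%d\n", within_tol(8.887087e-10, 8.9e-10, 0.0));     /* 0: not exact */
       return 0;
    }
]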
diff --git a/src/test/TEST_ij/solvers.jobs b/src/test/TEST_single/solvers_ij.jobs
similarity index 50%
copy from src/test/TEST_ij/solvers.jobs
copy to src/test/TEST_single/solvers_ij.jobs
index ac2aee8..e0c0a72 100755
--- a/src/test/TEST_ij/solvers.jobs
+++ b/src/test/TEST_single/solvers_ij.jobs
@@ -44,49 +44,49 @@
 #
 #=============================================================================
 
-mpirun -np 2 ./ij -solver 1 -rhsrand > solvers.out.0
-mpirun -np 2 ./ij -solver 2 -rhsrand > solvers.out.1
-mpirun -np 2 ./ij -solver 3 -rhsrand > solvers.out.2
-mpirun -np 2 ./ij -solver 4 -rhsrand > solvers.out.3
-mpirun -np 2 ./ij -solver 5 -rhsrand -w 0.67 -ns 2 > solvers.out.4
-mpirun -np 2 ./ij -solver 6 -rhsrand > solvers.out.5
-mpirun -np 2 ./ij -solver 7 -rhsrand > solvers.out.6
-mpirun -np 2 ./ij -solver 8 -rhsrand > solvers.out.7
-mpirun -np 2 ./ij -solver 20 -rhsrand > solvers.out.8
-mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand > solvers.out.9
-mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 2 > solvers.out.10
-mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 3 > solvers.out.11
+mpirun -np 2 ./ij -solver 1 -tol 1.e-4 -rhsrand > solvers_ij.out.0
+mpirun -np 2 ./ij -solver 2 -tol 1.e-4 -rhsrand > solvers_ij.out.1
+mpirun -np 2 ./ij -solver 3 -tol 1.e-4 -rhsrand > solvers_ij.out.2
+mpirun -np 2 ./ij -solver 4 -tol 1.e-4 -rhsrand > solvers_ij.out.3
+mpirun -np 2 ./ij -solver 5 -tol 1.e-4 -rhsrand -w 0.67 -ns 2 > solvers_ij.out.4
+mpirun -np 2 ./ij -solver 6 -tol 1.e-4 -rhsrand > solvers_ij.out.5
+mpirun -np 2 ./ij -solver 7 -tol 1.e-4 -rhsrand > solvers_ij.out.6
+mpirun -np 2 ./ij -solver 8 -tol 1.e-4 -rhsrand > solvers_ij.out.7
+mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -rhsrand > solvers_ij.out.8
+mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -cf 0.5 -rhsrand > solvers_ij.out.9
+mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -cf 0.5 -rhsrand -solver_type 2 > solvers_ij.out.10
+mpirun -np 2 ./ij -solver 20 -tol 1.e-4 -cf 0.5 -rhsrand -solver_type 3 > solvers_ij.out.11
 
 #systems AMG run ...unknown approach, hybrid approach, nodal approach
-mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 > solvers.out.sysu
-mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 > solvers.out.sysh
-mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 > solvers.out.sysn
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -tol 1.e-4 > solvers_ij.out.sysu
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 -tol 1.e-4 > solvers_ij.out.sysh
+mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 -tol 1.e-4 > solvers_ij.out.sysn
 
 #LGMRS and FlexGMRES
-mpirun -np 2 ./ij -solver 50 -rhsrand > solvers.out.101
-mpirun -np 2 ./ij -solver 51 -rhsrand > solvers.out.102
-mpirun -np 2 ./ij -solver 60 -rhsrand > solvers.out.103
-mpirun -np 2 ./ij -solver 61 -rhsrand > solvers.out.104
+mpirun -np 2 ./ij -solver 50 -tol 1.e-4 -rhsrand > solvers_ij.out.101
+mpirun -np 2 ./ij -solver 51 -tol 1.e-4 -rhsrand > solvers_ij.out.102
+mpirun -np 2 ./ij -solver 60 -tol 1.e-4 -rhsrand > solvers_ij.out.103
+mpirun -np 2 ./ij -solver 61 -tol 1.e-4 -rhsrand > solvers_ij.out.104
 
 #agglomerated coarse grid solve
-mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 > solvers.out.105
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 > solvers.out.107
+mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -tol 1.e-4 > solvers_ij.out.105
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -tol 1.e-4 > solvers_ij.out.107
 
 #redundant coarse grid solve
-mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 > solvers.out.106
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 > solvers.out.108
+mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 -tol 1.e-4 > solvers_ij.out.106
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 -tol 1.e-4 > solvers_ij.out.108
 
 #additive cycles
-mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 > solvers.out.109
-mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 > solvers.out.110
-mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 > solvers.out.111
-mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 > solvers.out.112
-mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 > solvers.out.113
+mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 -tol 1.e-4 > solvers_ij.out.109
+mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 -tol 1.e-4 > solvers_ij.out.110
+mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 -tol 1.e-4 > solvers_ij.out.111
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 -tol 1.e-4 > solvers_ij.out.112
+mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -tol 1.e-4 > solvers_ij.out.113
 
 #nonGalerkin version
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 > solvers.out.115
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 -tol 1.e-4 > solvers_ij.out.114
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 -tol 1.e-4 > solvers_ij.out.115
 
 #RAP options
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 > solvers.out.116
-mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 -tol 1.e-4 > solvers_ij.out.116
+mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 -tol 1.e-4 > solvers_ij.out.117
diff --git a/src/test/TEST_single/solvers_ij.saved b/src/test/TEST_single/solvers_ij.saved
new file mode 100644
index 0000000..0f519bd
--- /dev/null
+++ b/src/test/TEST_single/solvers_ij.saved
@@ -0,0 +1,145 @@
+# Output file: solvers_ij.out.0
+Iterations = 4
+Final Relative Residual Norm = 1.622019e-05
+
+# Output file: solvers_ij.out.1
+Iterations = 26
+Final Relative Residual Norm = 7.193490e-05
+
+# Output file: solvers_ij.out.2
+GMRES Iterations = 4
+Final GMRES Relative Residual Norm = 1.566634e-05
+
+# Output file: solvers_ij.out.3
+GMRES Iterations = 39
+Final GMRES Relative Residual Norm = 9.040770e-05
+
+# Output file: solvers_ij.out.4
+Iterations = 5
+Final Relative Residual Norm = 1.566360e-05
+
+# Output file: solvers_ij.out.5
+Iterations = 103
+Final Relative Residual Norm = 8.784404e-05
+
+# Output file: solvers_ij.out.6
+GMRES Iterations = 15
+Final GMRES Relative Residual Norm = 7.131740e-05
+
+# Output file: solvers_ij.out.7
+Iterations = 13
+Final Relative Residual Norm = 7.750608e-05
+
+# Output file: solvers_ij.out.8
+Iterations = 26
+PCG_Iterations = 0
+DSCG_Iterations = 26
+Final Relative Residual Norm = 7.193490e-05
+
+# Output file: solvers_ij.out.9
+Iterations = 8
+PCG_Iterations = 4
+DSCG_Iterations = 4
+Final Relative Residual Norm = 1.866110e-05
+
+# Output file: solvers_ij.out.10
+Iterations = 6
+PCG_Iterations = 4
+DSCG_Iterations = 2
+Final Relative Residual Norm = 5.408481e-05
+
+# Output file: solvers_ij.out.11
+Iterations = 5
+PCG_Iterations = 2
+DSCG_Iterations = 3
+Final Relative Residual Norm = 2.924414e-05
+
+# Output file: solvers_ij.out.sysh
+ Average Convergence Factor = 0.122042
+
+     Complexity:    grid = 1.613750
+                operator = 2.860298
+                   cycle = 5.720429
+
+# Output file: solvers_ij.out.sysn
+ Average Convergence Factor = 0.241065
+
+     Complexity:    grid = 1.592000
+                operator = 2.633619
+                   cycle = 11.267164
+
+# Output file: solvers_ij.out.sysu
+ Average Convergence Factor = 0.413666
+
+     Complexity:    grid = 1.614812
+                operator = 2.865126
+                   cycle = 5.730247
+
+# Output file: solvers_ij.out.101
+LGMRES Iterations = 39
+Final LGMRES Relative Residual Norm = 7.223550e-05
+
+# Output file: solvers_ij.out.102
+LGMRES Iterations = 4
+Final LGMRES Relative Residual Norm = 1.566634e-05
+
+# Output file: solvers_ij.out.103
+FlexGMRES Iterations = 39
+Final FlexGMRES Relative Residual Norm = 9.040783e-05
+
+# Output file: solvers_ij.out.104
+FlexGMRES Iterations = 4
+Final FlexGMRES Relative Residual Norm = 1.567694e-05
+
+# Output file: solvers_ij.out.105
+Iterations = 9
+Final Relative Residual Norm = 2.436101e-05
+
+# Output file: solvers_ij.out.106
+Iterations = 9
+Final Relative Residual Norm = 2.436101e-05
+
+# Output file: solvers_ij.out.107
+Iterations = 12
+Final Relative Residual Norm = 6.371664e-05
+
+# Output file: solvers_ij.out.108
+Iterations = 12
+Final Relative Residual Norm = 6.371664e-05
+
+# Output file: solvers_ij.out.109
+Iterations = 9
+Final Relative Residual Norm = 8.952725e-05
+
+# Output file: solvers_ij.out.110
+Iterations = 9
+Final Relative Residual Norm = 8.952696e-05
+
+# Output file: solvers_ij.out.111
+Iterations = 15
+Final Relative Residual Norm = 8.953455e-05
+
+# Output file: solvers_ij.out.112
+GMRES Iterations = 12
+Final GMRES Relative Residual Norm = 5.915735e-05
+
+# Output file: solvers_ij.out.113
+GMRES Iterations = 13
+Final GMRES Relative Residual Norm = 7.913169e-05
+
+# Output file: solvers_ij.out.114
+BoomerAMG Iterations = 9
+Final Relative Residual Norm = 3.746825e-05
+
+# Output file: solvers_ij.out.115
+BoomerAMG Iterations = 9
+Final Relative Residual Norm = 3.886564e-05
+
+# Output file: solvers_ij.out.116
+GMRES Iterations = 6
+Final GMRES Relative Residual Norm = 3.062358e-05
+
+# Output file: solvers_ij.out.117
+GMRES Iterations = 6
+Final GMRES Relative Residual Norm = 3.026567e-05
+
diff --git a/src/test/TEST_ij/solvers.sh b/src/test/TEST_single/solvers_ij.sh
similarity index 92%
copy from src/test/TEST_ij/solvers.sh
copy to src/test/TEST_single/solvers_ij.sh
index c9ac7ec..f024586 100755
--- a/src/test/TEST_ij/solvers.sh
+++ b/src/test/TEST_single/solvers_ij.sh
@@ -25,12 +25,13 @@ fi
 #                    should be the same
 #=============================================================================
 
-tail -17 ${TNAME}.out.109 | head -6 > ${TNAME}.testdata
+tail -17 ${TNAME}.out.109 | head -3 > ${TNAME}.testdata
 
 #=============================================================================
 
-tail -17 ${TNAME}.out.110 | head -6 > ${TNAME}.testdata.temp
-diff ${TNAME}.testdata ${TNAME}.testdata.temp >&2
+tail -17 ${TNAME}.out.110 | head -3 > ${TNAME}.testdata.temp
+#diff ${TNAME}.testdata ${TNAME}.testdata.temp >&2
+../runcheck.sh ${TNAME}.testdata ${TNAME}.testdata.temp 1.e-4 >&2
 
 #=============================================================================
 # compare with baseline case
diff --git a/src/test/TEST_single/solvers_struct.jobs b/src/test/TEST_single/solvers_struct.jobs
new file mode 100755
index 0000000..bcc0c51
--- /dev/null
+++ b/src/test/TEST_single/solvers_struct.jobs
@@ -0,0 +1,63 @@
+#!/bin/sh
+#BHEADER**********************************************************************
+# Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+# This file is part of HYPRE.  See file COPYRIGHT for details.
+#
+# HYPRE is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License (as published by the Free
+# Software Foundation) version 2.1 dated February 1999.
+#
+# $Revision$
+#EHEADER**********************************************************************
+
+
+
+
+
+
+
+#=============================================================================
+# struct: Run SMG-CG, PFMG-CG, DSCG, CG, and Hybrid.
+#=============================================================================
+
+mpirun -np 3 ./struct -P 1 1 3 -solver 10 -tol 1.0e-4 > solvers_struct.out.0
+mpirun -np 3 ./struct -P 1 3 1 -solver 11 -tol 1.0e-4 > solvers_struct.out.1
+mpirun -np 3 ./struct -P 3 1 1 -solver 17 -tol 1.0e-4 > solvers_struct.out.2
+mpirun -np 1 ./struct -P 1 1 1 -solver 18 -tol 1.0e-4 > solvers_struct.out.3
+mpirun -np 1 ./struct -P 1 1 1 -solver 19 -tol 1.0e-4 > solvers_struct.out.4
+
+
+#=============================================================================
+# Run default case with all available PCG preconditioners (solvers): 
+#    10: SMG (default)
+#    11: PFMG
+#    17: 2-step Jacobi
+#    18: Diagonal scaling
+#    19: none
+#=============================================================================
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs
+mpirun -np 2 ./struct -solver 10 -tol 1.e-4 > solvers_struct.out.10.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-4 -pcgitr 0 -seed 1 -vrand 1 > solvers_struct.out.10.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 10 -tol 1.e-4 -pcgitr 0  -seed 1 -vrand 3  > solvers_struct.out.10.lobpcg.3
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs
+mpirun -np 2 ./struct -solver 11 -tol 1.e-4  > solvers_struct.out.11.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-4  -pcgitr 0 -seed 1 -vrand 1  > solvers_struct.out.11.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 11 -tol 1.e-4  -pcgitr 0 -seed 1 -vrand 3  > solvers_struct.out.11.lobpcg.3
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs
+mpirun -np 2 ./struct -solver 17 -tol 1.e-4 > solvers_struct.out.17.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.17.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 17 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 3  > solvers_struct.out.17.lobpcg.3
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs
+mpirun -np 2 ./struct -solver 18 -tol 1.e-4 > solvers_struct.out.18.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.18.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 18 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 3  > solvers_struct.out.18.lobpcg.3
+
+# PCG run... LOBPCG with one eigenpair .... LOBPCG with 3 eigenpairs
+mpirun -np 2 ./struct -solver 19 -tol 1.e-4  > solvers_struct.out.19.lobpcg
+mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 1  > solvers_struct.out.19.lobpcg.1
+mpirun -np 2 ./struct -lobpcg -solver 19 -tol 1.e-4  -pcgitr 10 -seed 1 -vrand 3  > solvers_struct.out.19.lobpcg.3
diff --git a/src/test/TEST_single/solvers_struct.saved b/src/test/TEST_single/solvers_struct.saved
new file mode 100644
index 0000000..b4eb63a
--- /dev/null
+++ b/src/test/TEST_single/solvers_struct.saved
@@ -0,0 +1,120 @@
+# Output file: solvers_struct.out.0
+Iterations = 3
+Final Relative Residual Norm = 3.246673e-05
+
+# Output file: solvers_struct.out.1
+Iterations = 6
+Final Relative Residual Norm = 2.055852e-05
+
+# Output file: solvers_struct.out.2
+Iterations = 16
+Final Relative Residual Norm = 5.377689e-05
+
+# Output file: solvers_struct.out.3
+Iterations = 16
+Final Relative Residual Norm = 3.726248e-05
+
+# Output file: solvers_struct.out.4
+Iterations = 16
+Final Relative Residual Norm = 3.718718e-05
+
+# Output file: solvers_struct.out.10.lobpcg
+Iterations = 3
+Final Relative Residual Norm = 6.275783e-06
+
+# Output file: solvers_struct.out.10.lobpcg.1
+Eigenvalue lambda   1.84366211295128e-01
+Residual   2.48430933424970e-05
+
+# Output file: solvers_struct.out.10.lobpcg.3
+Iteration 10 	bsize 2 	maxres   4.35155990999192e-04
+Iteration 11 	bsize 1 	maxres   2.05302669201046e-04
+Iteration 12 	bsize 1 	maxres   8.52039884193800e-05
+
+Eigenvalue lambda   1.84366419911385e-01
+Eigenvalue lambda   2.50882804393768e-01
+Eigenvalue lambda   3.60089868307114e-01
+Residual   7.40827381378040e-05
+Residual   4.10445172747131e-05
+Residual   8.52039884193800e-05
+
+# Output file: solvers_struct.out.11.lobpcg
+Iterations = 6
+Final Relative Residual Norm = 2.112822e-05
+
+# Output file: solvers_struct.out.11.lobpcg.1
+Eigenvalue lambda   1.84366270899773e-01
+Residual   3.14826756948605e-05
+
+# Output file: solvers_struct.out.11.lobpcg.3
+Iteration 11 	bsize 2 	maxres   6.89935637637973e-04
+Iteration 12 	bsize 2 	maxres   2.52041267231107e-04
+Iteration 13 	bsize 1 	maxres   7.04026824678294e-05
+
+Eigenvalue lambda   1.84366539120674e-01
+Eigenvalue lambda   2.50883042812347e-01
+Eigenvalue lambda   3.60092163085938e-01
+Residual   5.59787331440020e-05
+Residual   2.58176805800758e-05
+Residual   7.04026824678294e-05
+
+# Output file: solvers_struct.out.17.lobpcg
+Iterations = 20
+Final Relative Residual Norm = 4.194806e-07
+
+# Output file: solvers_struct.out.17.lobpcg.1
+Eigenvalue lambda   1.84366390109062e-01
+Residual   1.95900083781453e-05
+
+# Output file: solvers_struct.out.17.lobpcg.3
+Iteration 10 	bsize 2 	maxres   3.62457707524300e-04
+Iteration 11 	bsize 1 	maxres   1.69860562891699e-04
+Iteration 12 	bsize 1 	maxres   7.12833571014926e-05
+
+Eigenvalue lambda   1.84366583824158e-01
+Eigenvalue lambda   2.50883996486664e-01
+Eigenvalue lambda   3.60090583562851e-01
+Residual   5.53683385078330e-05
+Residual   3.08582348225173e-05
+Residual   7.12833571014926e-05
+
+# Output file: solvers_struct.out.18.lobpcg
+Iterations = 33
+Final Relative Residual Norm = 8.027236e-07
+
+# Output file: solvers_struct.out.18.lobpcg.1
+Eigenvalue lambda   1.84366077184677e-01
+Residual   4.44860852439888e-05
+
+# Output file: solvers_struct.out.18.lobpcg.3
+Iteration 10 	bsize 2 	maxres   5.79534214921296e-04
+Iteration 11 	bsize 1 	maxres   1.98537352844141e-04
+Iteration 12 	bsize 1 	maxres   9.26745269680396e-05
+
+Eigenvalue lambda   1.84366509318352e-01
+Eigenvalue lambda   2.50894546508789e-01
+Eigenvalue lambda   3.60091388225555e-01
+Residual   9.26745269680396e-05
+Residual   8.93285541678779e-05
+Residual   5.54441285203211e-05
+
+# Output file: solvers_struct.out.19.lobpcg
+Iterations = 25
+Final Relative Residual Norm = 7.712499e-05
+
+# Output file: solvers_struct.out.19.lobpcg.1
+Eigenvalue lambda   1.84366524219513e-01
+Residual   4.43533899670001e-05
+
+# Output file: solvers_struct.out.19.lobpcg.3
+Iteration 10 	bsize 2 	maxres   5.82181091886014e-04
+Iteration 11 	bsize 1 	maxres   1.99127141968347e-04
+Iteration 12 	bsize 1 	maxres   9.26545544643886e-05
+
+Eigenvalue lambda   1.84366345405579e-01
+Eigenvalue lambda   2.50889092683792e-01
+Eigenvalue lambda   3.60089659690857e-01
+Residual   9.26545544643886e-05
+Residual   8.75688710948452e-05
+Residual   5.51430639461614e-05
+
diff --git a/src/test/TEST_ij/smoother.sh b/src/test/TEST_single/solvers_struct.sh
similarity index 77%
copy from src/test/TEST_ij/smoother.sh
copy to src/test/TEST_single/solvers_struct.sh
index 35b3fe5..711461a 100755
--- a/src/test/TEST_ij/smoother.sh
+++ b/src/test/TEST_single/solvers_struct.sh
@@ -11,8 +11,19 @@
 # $Revision$
 #EHEADER**********************************************************************
 
+
+
+
+
 TNAME=`basename $0 .sh`
+CONVTOL=$1
 
+# Set default check tolerance
+if [ "x$CONVTOL" = "x" ];
+then
+    CONVTOL=0.0
+fi
+#echo "tol = $CONVTOL"
 #=============================================================================
 # compare with baseline case
 #=============================================================================
@@ -22,37 +33,35 @@ FILES="\
  ${TNAME}.out.1\
  ${TNAME}.out.2\
  ${TNAME}.out.3\
+ ${TNAME}.out.4\
 "
 
 for i in $FILES
 do
   echo "# Output file: $i"
-  tail -21 $i | head -6
+  tail -3 $i
 done > ${TNAME}.out
 
 FILES="\
- ${TNAME}.out.4\
- ${TNAME}.out.5\
- ${TNAME}.out.6\
- ${TNAME}.out.7\
- ${TNAME}.out.8\
- ${TNAME}.out.9\
- ${TNAME}.out.10\
- ${TNAME}.out.11\
- ${TNAME}.out.12\
- ${TNAME}.out.13\
- ${TNAME}.out.14\
- ${TNAME}.out.15\
+ ${TNAME}.out.10.lobpcg\
+ ${TNAME}.out.11.lobpcg\
+ ${TNAME}.out.17.lobpcg\
+ ${TNAME}.out.18.lobpcg\
+ ${TNAME}.out.19.lobpcg\
 "
 
 for i in $FILES
 do
   echo "# Output file: $i"
   tail -3 $i
+  echo "# Output file: $i.1"
+  tail -13 $i.1 | head -3
+  echo "# Output file: $i.3"
+  tail -21 $i.3 | head -11
 done >> ${TNAME}.out
 
 # Make sure that the output files are reasonable
-CHECK_LINE="Complexity"
+CHECK_LINE="Iterations"
 OUT_COUNT=`grep "$CHECK_LINE" ${TNAME}.out | wc -l`
 SAVED_COUNT=`grep "$CHECK_LINE" ${TNAME}.saved | wc -l`
 if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then
@@ -60,7 +69,7 @@ if [ "$OUT_COUNT" != "$SAVED_COUNT" ]; then
 fi
 
 if [ -z $HYPRE_NO_SAVED ]; then
-   diff -U3 -bI"time" ${TNAME}.saved ${TNAME}.out >&2
+   (../runcheck.sh ${TNAME}.out ${TNAME}.saved $CONVTOL) >&2
 fi
 
 #=============================================================================
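The substantive change in this script is the comparison step: instead of a strict `diff -U3` against the saved baseline, the output is handed to `../runcheck.sh` together with `$CONVTOL` (defaulting to 0.0 above), so baselines may differ within a convergence tolerance. runcheck.sh itself is not part of this hunk; as a rough sketch of the comparison it has to perform (my reconstruction, not the shipped script), each numeric field is matched against the saved value with a relative tolerance:

    #include <math.h>

    /* Hypothetical helper mirroring a tolerance-aware baseline check:
     * two values match when their relative difference is within tol.
     * tol == 0.0 degenerates to exact equality, i.e. the old behavior. */
    static int values_match(double out, double saved, double tol)
    {
       double denom = (fabs(saved) > 0.0) ? fabs(saved) : 1.0;
       return fabs(out - saved) <= tol * denom;
    }

With the default `CONVTOL=0.0` this reduces to exact agreement, so existing test invocations keep their old strictness.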
diff --git a/src/test/TEST_sstruct/solvers.saved b/src/test/TEST_sstruct/solvers.saved
index 63f68a8..f59cd25 100644
--- a/src/test/TEST_sstruct/solvers.saved
+++ b/src/test/TEST_sstruct/solvers.saved
@@ -87,70 +87,70 @@ Iterations = 24
 Final Relative Residual Norm = 6.429522e-07
 
 # Output file: solvers.out.10.lobpcg.1
-Eigenvalue lambda   1.34883860790786e+00
-Residual   2.40758202827198e-03
+Eigenvalue lambda   1.34880853089918e+00
+Residual   1.40515754419491e-04
 
 # Output file: solvers.out.10.lobpcg.4
-Eigenvalue lambda   1.34880848246868e+00
-Eigenvalue lambda   1.36560847743331e+00
-Eigenvalue lambda   1.47908699641224e+00
-Eigenvalue lambda   1.49589925871990e+00
-Residual   2.16895015738541e-06
-Residual   2.26924940977357e-06
-Residual   2.30461087038239e-06
-Residual   2.31349855264711e-06
+Eigenvalue lambda   1.34880848246852e+00
+Eigenvalue lambda   1.36560847743334e+00
+Eigenvalue lambda   1.47908699641636e+00
+Eigenvalue lambda   1.49589925873578e+00
+Residual   1.88411068757809e-06
+Residual   1.92352730596892e-06
+Residual   2.62402461132032e-06
+Residual   5.33772292781341e-06
 
 # Output file: solvers.out.11.lobpcg
 Iterations = 24
 Final Relative Residual Norm = 6.654613e-07
 
 # Output file: solvers.out.11.lobpcg.1
-Eigenvalue lambda   1.34884333064988e+00
-Residual   2.59663483715351e-03
+Eigenvalue lambda   1.34880853049745e+00
+Residual   1.27609473406123e-04
 
 # Output file: solvers.out.11.lobpcg.4
-Eigenvalue lambda   1.34880848246851e+00
-Eigenvalue lambda   1.36560847743324e+00
-Eigenvalue lambda   1.47908699641209e+00
-Eigenvalue lambda   1.49589925871908e+00
-Residual   1.80601168111517e-06
-Residual   2.08224805414380e-06
-Residual   2.43864085668235e-06
-Residual   2.07089097060174e-06
+Eigenvalue lambda   1.34880848246853e+00
+Eigenvalue lambda   1.36560847743866e+00
+Eigenvalue lambda   1.47908699650267e+00
+Eigenvalue lambda   1.49589925889085e+00
+Residual   2.34138605050087e-06
+Residual   2.11050112570845e-06
+Residual   2.34273161452232e-06
+Residual   4.55133814558691e-06
 
 # Output file: solvers.out.18.lobpcg
 Iterations = 25
 Final Relative Residual Norm = 9.124482e-07
 
 # Output file: solvers.out.18.lobpcg.1
-Eigenvalue lambda   1.34880848247360e+00
-Residual   1.27710890197410e-06
+Eigenvalue lambda   1.34880848247421e+00
+Residual   1.68760664468992e-06
 
 # Output file: solvers.out.18.lobpcg.4
-Eigenvalue lambda   1.34880848246838e+00
-Eigenvalue lambda   1.36560847743320e+00
-Eigenvalue lambda   1.47908699641198e+00
-Eigenvalue lambda   1.49589925871866e+00
-Residual   1.35084619640744e-06
-Residual   1.98965928996447e-06
-Residual   1.13053053095074e-06
-Residual   2.44142594272676e-06
+Eigenvalue lambda   1.34880848246833e+00
+Eigenvalue lambda   1.36560847743301e+00
+Eigenvalue lambda   1.47908699641137e+00
+Eigenvalue lambda   1.49589925871590e+00
+Residual   7.29467860779249e-07
+Residual   9.62135023675930e-07
+Residual   1.05238502112812e-06
+Residual   1.42836272383011e-06
 
 # Output file: solvers.out.19.lobpcg
 Iterations = 25
 Final Relative Residual Norm = 9.124482e-07
 
 # Output file: solvers.out.19.lobpcg.1
-Eigenvalue lambda   1.34880848247360e+00
-Residual   1.27710890239225e-06
+Eigenvalue lambda   1.34880848247421e+00
+Residual   1.68760664640186e-06
 
 # Output file: solvers.out.19.lobpcg.4
-Eigenvalue lambda   1.34880848246840e+00
-Eigenvalue lambda   1.36560847743319e+00
-Eigenvalue lambda   1.47908699641203e+00
-Eigenvalue lambda   1.49589925871871e+00
-Residual   1.35084619677939e-06
-Residual   1.98965929201569e-06
-Residual   1.13053053130770e-06
-Residual   2.44142594408303e-06
+Eigenvalue lambda   1.34880848246915e+00
+Eigenvalue lambda   1.36560847743475e+00
+Eigenvalue lambda   1.47908699641975e+00
+Eigenvalue lambda   1.49589925875392e+00
+Residual   7.29467866482847e-07
+Residual   9.62135024900930e-07
+Residual   1.05238500241244e-06
+Residual   1.42836275555007e-06
 
diff --git a/src/test/TEST_struct/solvers.saved b/src/test/TEST_struct/solvers.saved
index b474e66..c7203e6 100644
--- a/src/test/TEST_struct/solvers.saved
+++ b/src/test/TEST_struct/solvers.saved
@@ -23,98 +23,98 @@ Iterations = 4
 Final Relative Residual Norm = 1.355288e-07
 
 # Output file: solvers.out.10.lobpcg.1
-Eigenvalue lambda   1.84366453092285e-01
-Residual   9.48102341087739e-07
+Eigenvalue lambda   1.84366453091860e-01
+Residual   4.52048636947317e-07
 
 # Output file: solvers.out.10.lobpcg.5
-Eigenvalue lambda   1.84366453091761e-01
-Eigenvalue lambda   2.50882493969895e-01
-Eigenvalue lambda   3.60090369737266e-01
-Eigenvalue lambda   4.20845334658830e-01
-Eigenvalue lambda   4.20845334659278e-01
-Residual   3.22224736401280e-07
-Residual   4.14402445882080e-07
-Residual   9.42143466466521e-07
-Residual   7.77020993206880e-07
-Residual   7.94159301934379e-07
+Eigenvalue lambda   1.84366453091770e-01
+Eigenvalue lambda   2.50882493969728e-01
+Eigenvalue lambda   3.60090369737200e-01
+Eigenvalue lambda   4.20845334658571e-01
+Eigenvalue lambda   4.20845334658890e-01
+Residual   3.35930189890736e-07
+Residual   3.11260814535858e-07
+Residual   7.33053164875689e-07
+Residual   5.14259223167152e-07
+Residual   7.32161520953280e-07
 
 # Output file: solvers.out.11.lobpcg
 Iterations = 8
 Final Relative Residual Norm = 4.807900e-07
 
 # Output file: solvers.out.11.lobpcg.1
-Eigenvalue lambda   1.84366453091815e-01
-Residual   3.85721107269604e-07
+Eigenvalue lambda   1.84366453092320e-01
+Residual   1.05361158868911e-06
 
 # Output file: solvers.out.11.lobpcg.5
-Eigenvalue lambda   1.84366453091761e-01
-Eigenvalue lambda   2.50882493969760e-01
-Eigenvalue lambda   3.60090369737228e-01
-Eigenvalue lambda   4.20845334658737e-01
-Eigenvalue lambda   4.20845334659947e-01
-Residual   2.05382831978917e-07
-Residual   4.59866546803063e-07
-Residual   9.90818911866409e-07
-Residual   1.33480975037863e-06
-Residual   1.29716729646718e-06
+Eigenvalue lambda   1.84366453091757e-01
+Eigenvalue lambda   2.50882493969849e-01
+Eigenvalue lambda   3.60090369737247e-01
+Eigenvalue lambda   4.20845334660032e-01
+Eigenvalue lambda   4.20845334660337e-01
+Residual   1.94648827543750e-07
+Residual   1.22160192258141e-06
+Residual   7.71834588037176e-07
+Residual   1.28057049961329e-06
+Residual   1.26100802396762e-06
 
 # Output file: solvers.out.17.lobpcg
 Iterations = 17
 Final Relative Residual Norm = 8.241147e-07
 
 # Output file: solvers.out.17.lobpcg.1
-Eigenvalue lambda   1.84366453091981e-01
-Residual   6.58030973127707e-07
+Eigenvalue lambda   1.84366453091822e-01
+Residual   2.74559569275018e-07
 
 # Output file: solvers.out.17.lobpcg.5
-Eigenvalue lambda   1.84366453091754e-01
-Eigenvalue lambda   2.50882493969533e-01
-Eigenvalue lambda   3.60090369737259e-01
-Eigenvalue lambda   4.20845334658560e-01
-Eigenvalue lambda   4.20845334659094e-01
-Residual   2.18600906457648e-07
-Residual   2.01409973718392e-07
-Residual   7.10622693277451e-07
-Residual   8.92153133859236e-07
-Residual   9.10269958132537e-07
+Eigenvalue lambda   1.84366453091761e-01
+Eigenvalue lambda   2.50882493969758e-01
+Eigenvalue lambda   3.60090369737186e-01
+Eigenvalue lambda   4.20845334658611e-01
+Eigenvalue lambda   4.20845334658816e-01
+Residual   2.41984718651313e-07
+Residual   1.16310823945663e-06
+Residual   6.95498018578113e-07
+Residual   7.11395856759138e-07
+Residual   7.20734654889730e-07
 
 # Output file: solvers.out.18.lobpcg
 Iterations = 32
 Final Relative Residual Norm = 8.259590e-07
 
 # Output file: solvers.out.18.lobpcg.1
-Eigenvalue lambda   1.84366453091822e-01
-Residual   2.06395470109285e-07
+Eigenvalue lambda   1.84366453091920e-01
+Residual   4.04811751101852e-07
 
 # Output file: solvers.out.18.lobpcg.5
-Eigenvalue lambda   1.84366453091753e-01
-Eigenvalue lambda   2.50882493969752e-01
-Eigenvalue lambda   3.60090369737251e-01
-Eigenvalue lambda   4.20845334658871e-01
-Eigenvalue lambda   4.20845334659104e-01
-Residual   1.23291603592449e-07
-Residual   4.17547991069515e-07
-Residual   1.17426104064181e-06
-Residual   1.09268026031814e-06
-Residual   9.56432798770891e-07
+Eigenvalue lambda   1.84366453091760e-01
+Eigenvalue lambda   2.50882493969440e-01
+Eigenvalue lambda   3.60090369737180e-01
+Eigenvalue lambda   4.20845334658791e-01
+Eigenvalue lambda   4.20845334659400e-01
+Residual   6.51620151548372e-07
+Residual   1.04113397676966e-06
+Residual   1.05795257133655e-06
+Residual   1.01625954175797e-06
+Residual   1.09288008877697e-06
 
 # Output file: solvers.out.19.lobpcg
 Iterations = 32
 Final Relative Residual Norm = 8.259590e-07
 
 # Output file: solvers.out.19.lobpcg.1
-Eigenvalue lambda   1.84366453091822e-01
-Residual   2.06395470089861e-07
+Eigenvalue lambda   1.84366453091920e-01
+Residual   4.04811751079993e-07
 
 # Output file: solvers.out.19.lobpcg.5
-Eigenvalue lambda   1.84366453091755e-01
-Eigenvalue lambda   2.50882493969748e-01
-Eigenvalue lambda   3.60090369737247e-01
-Eigenvalue lambda   4.20845334658872e-01
-Eigenvalue lambda   4.20845334659103e-01
-Residual   1.23291603639448e-07
-Residual   4.17547991296164e-07
-Residual   1.17426103999361e-06
-Residual   1.09271838882886e-06
-Residual   9.55993938911220e-07
+Eigenvalue lambda   1.84366453091760e-01
+Eigenvalue lambda   2.50882493969706e-01
+Eigenvalue lambda   3.60090369737178e-01
+Eigenvalue lambda   4.20845334658819e-01
+Eigenvalue lambda   4.20845334659484e-01
+Residual   6.51620151941224e-07
+Residual   1.04113396974827e-06
+Residual   1.05795252287838e-06
+Residual   1.01635836602686e-06
+Residual   1.09013903468312e-06
 
diff --git a/src/test/ams_driver.c b/src/test/ams_driver.c
index 76d30b9..8dd2165 100644
--- a/src/test/ams_driver.c
+++ b/src/test/ams_driver.c
@@ -128,7 +128,7 @@ hypre_int main (hypre_int argc, char *argv[])
    hypre_MPI_Init(&argc, &argv);
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs);
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid);
-
+   hypre_GPUInit(-1);
    /* Set defaults */
    solver_id = 3;
    maxit = 100;
@@ -752,7 +752,7 @@ hypre_int main (hypre_int argc, char *argv[])
 
    if (zero_cond)
       HYPRE_ParVectorDestroy(interior_nodes);
-
+   hypre_GPUFinalize();
    hypre_MPI_Finalize();
 
    if (HYPRE_GetError() && !myid)
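The two added lines bracket the whole driver between `hypre_GPUInit(-1)` and `hypre_GPUFinalize()`, nested inside the MPI init/finalize pair. A minimal skeleton of the intended ordering (a sketch based only on the calls visible in this diff; these are internal hypre symbols, and the meaning of the -1 argument, presumably automatic device selection, is an assumption):

    #include "_hypre_utilities.h"

    hypre_int main(hypre_int argc, char *argv[])
    {
       hypre_MPI_Init(&argc, &argv);

       /* Device setup after MPI init, so each rank can bind to a GPU. */
       hypre_GPUInit(-1);

       /* ... build the system and run solvers ... */

       /* Tear down in reverse order. */
       hypre_GPUFinalize();
       hypre_MPI_Finalize();
       return 0;
    }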
diff --git a/src/test/for_maxwell.c b/src/test/for_maxwell.c
index 2afd87b..05e4d85 100644
--- a/src/test/for_maxwell.c
+++ b/src/test/for_maxwell.c
@@ -1935,7 +1935,7 @@ main( HYPRE_Int   argc,
    for (j = 0; j < data.max_boxsize; j++)
    {
       values[j]= sin((HYPRE_Real)(j+1));
-      values[j]= (HYPRE_Real) rand()/RAND_MAX;
+      values[j]= (HYPRE_Real) hypre_Rand();
       values[j]= (HYPRE_Real) j;
    }
    for (part = 0; part < data.nparts; part++)
diff --git a/src/test/ij.c b/src/test/ij.c
index 76de9da..8bcdf76 100644
--- a/src/test/ij.c
+++ b/src/test/ij.c
@@ -44,6 +44,9 @@
 #include "multivector.h"
 #include "HYPRE_MatvecFunctions.h"
 
+/* max dt */
+#define DT_INF 1.0e30
+
 HYPRE_Int
 BuildParIsoLaplacian( HYPRE_Int argc, char** argv, HYPRE_ParCSRMatrix *A_ptr );
 
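A plausible reason for lowering the sentinel from 1.e40 to DT_INF = 1.0e30 (my inference; the patch does not say): hypre can be built with HYPRE_Real as single precision, and 1.e40 exceeds FLT_MAX (about 3.4e38), while 1.0e30 stays representable. A standalone check:

    #include <float.h>
    #include <stdio.h>

    int main(void)
    {
       float old_sentinel = (float) 1.e40;   /* outside float range: +inf under IEEE arithmetic */
       float new_sentinel = (float) 1.0e30;  /* representable: < FLT_MAX */
       printf("old = %g, new = %g, FLT_MAX = %g\n",
              old_sentinel, new_sentinel, FLT_MAX);
       return 0;
    }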
@@ -58,7 +61,7 @@ HYPRE_Int BuildParRhsFromFile (HYPRE_Int argc , char *argv [], HYPRE_Int arg_ind
 
 HYPRE_Int BuildParLaplacian (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr );
 HYPRE_Int BuildParSysLaplacian (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr );
-HYPRE_Int BuildParDifConv (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr );
+HYPRE_Int BuildParDifConv (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix *A_ptr);
 HYPRE_Int BuildParFromOneFile (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_Int num_functions , HYPRE_ParCSRMatrix *A_ptr );
 HYPRE_Int BuildFuncsFromFiles (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix A , HYPRE_Int **dof_func_ptr );
 HYPRE_Int BuildFuncsFromOneFile (HYPRE_Int argc , char *argv [], HYPRE_Int arg_index , HYPRE_ParCSRMatrix A , HYPRE_Int **dof_func_ptr );
@@ -132,7 +135,7 @@ main( hypre_int argc,
    HYPRE_IJVector      *ij_rbm;
 
    HYPRE_ParCSRMatrix  parcsr_A;
-   HYPRE_ParVector     b;
+   HYPRE_ParVector     b = NULL;
    HYPRE_ParVector     x;
    HYPRE_ParVector     *interp_vecs = NULL;
 
@@ -173,7 +176,7 @@ main( hypre_int argc,
    HYPRE_Int Q_max = 0;
    HYPRE_Real Q_trunc = 0;
 
-   const HYPRE_Real dt_inf = 1.e40;
+   const HYPRE_Real dt_inf = DT_INF;
    HYPRE_Real dt = dt_inf;
 
    /* parameters for BoomerAMG */
@@ -213,6 +216,7 @@ main( hypre_int argc,
    HYPRE_Int additive = -1;
    HYPRE_Int mult_add = -1;
    HYPRE_Int simple = -1;
+   HYPRE_Int add_last_lvl = -1;
    HYPRE_Int add_P_max_elmts = 0;
    HYPRE_Real add_trunc_factor = 0;
 
@@ -228,6 +232,9 @@ main( hypre_int argc,
    HYPRE_Real   max_row_sum = 1.;
 
    HYPRE_Int cheby_order = 2;
+   HYPRE_Int cheby_eig_est = 10;
+   HYPRE_Int cheby_variant = 0;
+   HYPRE_Int cheby_scale = 1;
    HYPRE_Real cheby_fraction = .3;
 
    /* for CGC BM Aug 25, 2006 */
@@ -342,6 +349,8 @@ main( hypre_int argc,
 
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs );
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
+   hypre_GPUInit(-1);
+   //nvtxDomainHandle_t domain = nvtxDomainCreateA("Domain_A");
 /*
   hypre_InitMemoryDebug(myid);
 */
@@ -1207,6 +1216,21 @@ main( hypre_int argc,
          arg_index++;
          cheby_order = atoi(argv[arg_index++]);
       }
+      else if ( strcmp(argv[arg_index], "-cheby_eig_est") == 0 )
+      {
+         arg_index++;
+         cheby_eig_est = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-cheby_variant") == 0 )
+      {
+         arg_index++;
+         cheby_variant = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-cheby_scale") == 0 )
+      {
+         arg_index++;
+         cheby_scale = atoi(argv[arg_index++]);
+      }
       else if ( strcmp(argv[arg_index], "-cheby_fraction") == 0 )
       {
          arg_index++;
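All three new options follow the driver's parsing idiom: consume the flag, then atoi the following argument. Downstream they plausibly map onto BoomerAMG's Chebyshev smoother controls; a sketch of that wiring (SetChebyOrder/SetChebyFraction are long-standing hypre API, while the other three setters are assumed here to match the new flags in this hypre version):

    /* Sketch: forward the parsed Chebyshev options to a BoomerAMG solver. */
    HYPRE_BoomerAMGSetChebyOrder(amg_solver, cheby_order);
    HYPRE_BoomerAMGSetChebyFraction(amg_solver, cheby_fraction);
    HYPRE_BoomerAMGSetChebyEigEst(amg_solver, cheby_eig_est);   /* CG steps for the eigenvalue estimate */
    HYPRE_BoomerAMGSetChebyVariant(amg_solver, cheby_variant);
    HYPRE_BoomerAMGSetChebyScale(amg_solver, cheby_scale);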
@@ -1227,6 +1251,11 @@ main( hypre_int argc,
          arg_index++;
          simple  = atoi(argv[arg_index++]);
       }
+      else if ( strcmp(argv[arg_index], "-add_end") == 0 )
+      {
+         arg_index++;
+         add_last_lvl  = atoi(argv[arg_index++]);
+      }
       else if ( strcmp(argv[arg_index], "-add_Pmx") == 0 )
       {
          arg_index++;
@@ -1291,289 +1320,296 @@ main( hypre_int argc,
     * Print usage info
     *-----------------------------------------------------------*/
  
-   if ( (print_usage) && (myid == 0) )
+   if ( print_usage )
    {
-      hypre_printf("\n");
-      hypre_printf("Usage: %s [<options>]\n", argv[0]);
-      hypre_printf("\n");
-      hypre_printf("  -fromfile <filename>       : ");
-      hypre_printf("matrix read from multiple files (IJ format)\n");
-      hypre_printf("  -fromparcsrfile <filename> : ");
-      hypre_printf("matrix read from multiple files (ParCSR format)\n");
-      hypre_printf("  -fromonecsrfile <filename> : ");
-      hypre_printf("matrix read from a single file (CSR format)\n");
-      hypre_printf("\n");
-      hypre_printf("  -laplacian [<options>] : build 5pt 2D laplacian problem (default) \n");
-      hypre_printf("  -sysL <num functions>  : build SYSTEMS laplacian 7pt operator\n");
-      hypre_printf("  -9pt [<opts>]          : build 9pt 2D laplacian problem\n");
-      hypre_printf("  -27pt [<opts>]         : build 27pt 3D laplacian problem\n");
-      hypre_printf("  -difconv [<opts>]      : build convection-diffusion problem\n");
-      hypre_printf("    -n <nx> <ny> <nz>    : total problem size \n");
-      hypre_printf("    -P <Px> <Py> <Pz>    : processor topology\n");
-      hypre_printf("    -c <cx> <cy> <cz>    : diffusion coefficients\n");
-      hypre_printf("    -a <ax> <ay> <az>    : convection coefficients\n");
-      hypre_printf("\n");
-      hypre_printf("  -exact_size            : inserts immediately into ParCSR structure\n");
-      hypre_printf("  -storage_low           : allocates not enough storage for aux struct\n");
-      hypre_printf("  -concrete_parcsr       : use parcsr matrix type as concrete type\n");
-      hypre_printf("\n");
-      hypre_printf("  -rhsfromfile           : ");
-      hypre_printf("rhs read from multiple files (IJ format)\n");
-      hypre_printf("  -rhsfromonefile        : ");
-      hypre_printf("rhs read from a single file (CSR format)\n");
-      hypre_printf("  -rhsparcsrfile        :  ");
-      hypre_printf("rhs read from multiple files (ParCSR format)\n");
-      hypre_printf("  -rhsrand               : rhs is random vector\n");
-      hypre_printf("  -rhsisone              : rhs is vector with unit components (default)\n");
-      hypre_printf("  -xisone                : solution of all ones\n");
-      hypre_printf("  -rhszero               : rhs is zero vector\n");
-      hypre_printf("\n");
-      hypre_printf("  -dt <val>              : specify finite backward Euler time step\n");
-      hypre_printf("                         :    -rhsfromfile, -rhsfromonefile, -rhsrand,\n");
-      hypre_printf("                         :    -rhsrand, or -xisone will be ignored\n");
-      hypre_printf("  -srcfromfile           : ");
-      hypre_printf("backward Euler source read from multiple files (IJ format)\n");
-      hypre_printf("  -srcfromonefile        : ");
-      hypre_printf("backward Euler source read from a single file (IJ format)\n");
-      hypre_printf("  -srcrand               : ");
-      hypre_printf("backward Euler source is random vector with components in range 0 - 1\n");
-      hypre_printf("  -srcisone              : ");
-      hypre_printf("backward Euler source is vector with unit components (default)\n");
-      hypre_printf("  -srczero               : ");
-      hypre_printf("backward Euler source is zero-vector\n");
-      hypre_printf("\n");
-      hypre_printf("  -solver <ID>           : solver ID\n");
-      hypre_printf("       0=AMG               1=AMG-PCG        \n");
-      hypre_printf("       2=DS-PCG            3=AMG-GMRES      \n");
-      hypre_printf("       4=DS-GMRES          5=AMG-CGNR       \n");     
-      hypre_printf("       6=DS-CGNR           7=PILUT-GMRES    \n");     
-      hypre_printf("       8=ParaSails-PCG     9=AMG-BiCGSTAB   \n");
-      hypre_printf("       10=DS-BiCGSTAB     11=PILUT-BiCGSTAB \n");
-      hypre_printf("       12=Schwarz-PCG     13=GSMG           \n");     
-      hypre_printf("       14=GSMG-PCG        15=GSMG-GMRES\n");     
-      hypre_printf("       18=ParaSails-GMRES\n");     
-      hypre_printf("       20=Hybrid solver/ DiagScale, AMG \n");
-      hypre_printf("       43=Euclid-PCG      44=Euclid-GMRES   \n");
-      hypre_printf("       45=Euclid-BICGSTAB\n");
-      hypre_printf("       50=DS-LGMRES         51=AMG-LGMRES     \n");
-      hypre_printf("       60=DS-FlexGMRES         61=AMG-FlexGMRES     \n");
-      hypre_printf("\n");
-      hypre_printf("  -cljp                 : CLJP coarsening \n");
-      hypre_printf("  -cljp1                : CLJP coarsening, fixed random \n");
-      hypre_printf("  -cgc                  : CGC coarsening \n");
-      hypre_printf("  -cgce                 : CGC-E coarsening \n");
-      hypre_printf("  -pmis                 : PMIS coarsening \n");
-      hypre_printf("  -pmis1                : PMIS coarsening, fixed random \n");
-      hypre_printf("  -hmis                 : HMIS coarsening (default)\n");
-      hypre_printf("  -ruge                 : Ruge-Stueben coarsening (local)\n");
-      hypre_printf("  -ruge1p               : Ruge-Stueben coarsening 1st pass only(local)\n");
-      hypre_printf("  -ruge3                : third pass on boundary\n");
-      hypre_printf("  -ruge3c               : third pass on boundary, keep c-points\n");
-      hypre_printf("  -falgout              : local Ruge_Stueben followed by CLJP\n");
-      hypre_printf("  -gm                   : use global measures\n");
-      hypre_printf("\n");
-      hypre_printf("  -interptype  <val>    : set interpolation type\n");
-      hypre_printf("       0=Classical modified interpolation  \n");
-      hypre_printf("       1=least squares interpolation (for GSMG only)  \n");
-      hypre_printf("       0=Classical modified interpolation for hyperbolic PDEs \n");
-      hypre_printf("       3=direct interpolation with separation of weights  \n");
-      hypre_printf("       4=multipass interpolation  \n");
-      hypre_printf("       5=multipass interpolation with separation of weights  \n");
-      hypre_printf("       6=extended classical modified interpolation (default) \n");
-      hypre_printf("       7=extended (only if no common C neighbor) interpolation  \n");
-      hypre_printf("       8=standard interpolation  \n");
-      hypre_printf("       9=standard interpolation with separation of weights  \n");
-      hypre_printf("      12=FF interpolation  \n");
-      hypre_printf("      13=FF1 interpolation  \n");
- 
-      hypre_printf("      16=use modified unknown interpolation for a system (w/unknown or hybrid approach) \n");
-      hypre_printf("      17=use non-systems interp = 6 for a system (w/unknown or hybrid approach) \n");
-      hypre_printf("      18=use non-systems interp = 8 for a system (w/unknown or hybrid approach) \n");
-      hypre_printf("      19=use non-systems interp = 0 for a system (w/unknown or hybrid approach) \n");
-      
+      if ( myid == 0 )
+      {
+         hypre_printf("\n");
+         hypre_printf("Usage: %s [<options>]\n", argv[0]);
+         hypre_printf("\n");
+         hypre_printf("  -fromfile <filename>       : ");
+         hypre_printf("matrix read from multiple files (IJ format)\n");
+         hypre_printf("  -fromparcsrfile <filename> : ");
+         hypre_printf("matrix read from multiple files (ParCSR format)\n");
+         hypre_printf("  -fromonecsrfile <filename> : ");
+         hypre_printf("matrix read from a single file (CSR format)\n");
+         hypre_printf("\n");
+         hypre_printf("  -laplacian [<options>] : build 5pt 2D laplacian problem (default) \n");
+         hypre_printf("  -sysL <num functions>  : build SYSTEMS laplacian 7pt operator\n");
+         hypre_printf("  -9pt [<opts>]          : build 9pt 2D laplacian problem\n");
+         hypre_printf("  -27pt [<opts>]         : build 27pt 3D laplacian problem\n");
+         hypre_printf("  -difconv [<opts>]      : build convection-diffusion problem\n");
+         hypre_printf("    -n <nx> <ny> <nz>    : total problem size \n");
+         hypre_printf("    -P <Px> <Py> <Pz>    : processor topology\n");
+         hypre_printf("    -c <cx> <cy> <cz>    : diffusion coefficients\n");
+         hypre_printf("    -a <ax> <ay> <az>    : convection coefficients\n");
+         hypre_printf("    -atype <type>        : FD scheme for convection \n");
+         hypre_printf("           0=Forward (default)       1=Backward\n");
+         hypre_printf("           2=Centered                3=Upwind\n");
+         hypre_printf("\n");
+         hypre_printf("  -exact_size            : inserts immediately into ParCSR structure\n");
+         hypre_printf("  -storage_low           : allocates not enough storage for aux struct\n");
+         hypre_printf("  -concrete_parcsr       : use parcsr matrix type as concrete type\n");
+         hypre_printf("\n");
+         hypre_printf("  -rhsfromfile           : ");
+         hypre_printf("rhs read from multiple files (IJ format)\n");
+         hypre_printf("  -rhsfromonefile        : ");
+         hypre_printf("rhs read from a single file (CSR format)\n");
+         hypre_printf("  -rhsparcsrfile        :  ");
+         hypre_printf("rhs read from multiple files (ParCSR format)\n");
+         hypre_printf("  -rhsrand               : rhs is random vector\n");
+         hypre_printf("  -rhsisone              : rhs is vector with unit components (default)\n");
+         hypre_printf("  -xisone                : solution of all ones\n");
+         hypre_printf("  -rhszero               : rhs is zero vector\n");
+         hypre_printf("\n");
+         hypre_printf("  -dt <val>              : specify finite backward Euler time step\n");
+         hypre_printf("                         :    -rhsfromfile, -rhsfromonefile, -rhsrand,\n");
+         hypre_printf("                         :    -rhsrand, or -xisone will be ignored\n");
+         hypre_printf("  -srcfromfile           : ");
+         hypre_printf("backward Euler source read from multiple files (IJ format)\n");
+         hypre_printf("  -srcfromonefile        : ");
+         hypre_printf("backward Euler source read from a single file (IJ format)\n");
+         hypre_printf("  -srcrand               : ");
+         hypre_printf("backward Euler source is random vector with components in range 0 - 1\n");
+         hypre_printf("  -srcisone              : ");
+         hypre_printf("backward Euler source is vector with unit components (default)\n");
+         hypre_printf("  -srczero               : ");
+         hypre_printf("backward Euler source is zero-vector\n");
+         hypre_printf("\n");
+         hypre_printf("  -solver <ID>           : solver ID\n");
+         hypre_printf("       0=AMG               1=AMG-PCG        \n");
+         hypre_printf("       2=DS-PCG            3=AMG-GMRES      \n");
+         hypre_printf("       4=DS-GMRES          5=AMG-CGNR       \n");     
+         hypre_printf("       6=DS-CGNR           7=PILUT-GMRES    \n");     
+         hypre_printf("       8=ParaSails-PCG     9=AMG-BiCGSTAB   \n");
+         hypre_printf("       10=DS-BiCGSTAB     11=PILUT-BiCGSTAB \n");
+         hypre_printf("       12=Schwarz-PCG     13=GSMG           \n");     
+         hypre_printf("       14=GSMG-PCG        15=GSMG-GMRES\n");     
+         hypre_printf("       18=ParaSails-GMRES\n");     
+         hypre_printf("       20=Hybrid solver/ DiagScale, AMG \n");
+         hypre_printf("       43=Euclid-PCG      44=Euclid-GMRES   \n");
+         hypre_printf("       45=Euclid-BICGSTAB\n");
+         hypre_printf("       50=DS-LGMRES         51=AMG-LGMRES     \n");
+         hypre_printf("       60=DS-FlexGMRES         61=AMG-FlexGMRES     \n");
+         hypre_printf("\n");
+         hypre_printf("  -cljp                 : CLJP coarsening \n");
+         hypre_printf("  -cljp1                : CLJP coarsening, fixed random \n");
+         hypre_printf("  -cgc                  : CGC coarsening \n");
+         hypre_printf("  -cgce                 : CGC-E coarsening \n");
+         hypre_printf("  -pmis                 : PMIS coarsening \n");
+         hypre_printf("  -pmis1                : PMIS coarsening, fixed random \n");
+         hypre_printf("  -hmis                 : HMIS coarsening (default)\n");
+         hypre_printf("  -ruge                 : Ruge-Stueben coarsening (local)\n");
+         hypre_printf("  -ruge1p               : Ruge-Stueben coarsening 1st pass only(local)\n");
+         hypre_printf("  -ruge3                : third pass on boundary\n");
+         hypre_printf("  -ruge3c               : third pass on boundary, keep c-points\n");
+         hypre_printf("  -falgout              : local Ruge_Stueben followed by CLJP\n");
+         hypre_printf("  -gm                   : use global measures\n");
+         hypre_printf("\n");
+         hypre_printf("  -interptype  <val>    : set interpolation type\n");
+         hypre_printf("       0=Classical modified interpolation  \n");
+         hypre_printf("       1=least squares interpolation (for GSMG only)  \n");
+         hypre_printf("       0=Classical modified interpolation for hyperbolic PDEs \n");
+         hypre_printf("       3=direct interpolation with separation of weights  \n");
+         hypre_printf("       4=multipass interpolation  \n");
+         hypre_printf("       5=multipass interpolation with separation of weights  \n");
+         hypre_printf("       6=extended classical modified interpolation (default) \n");
+         hypre_printf("       7=extended (only if no common C neighbor) interpolation  \n");
+         hypre_printf("       8=standard interpolation  \n");
+         hypre_printf("       9=standard interpolation with separation of weights  \n");
+         hypre_printf("      12=FF interpolation  \n");
+         hypre_printf("      13=FF1 interpolation  \n");
+
+         hypre_printf("      16=use modified unknown interpolation for a system (w/unknown or hybrid approach) \n");
+         hypre_printf("      17=use non-systems interp = 6 for a system (w/unknown or hybrid approach) \n");
+         hypre_printf("      18=use non-systems interp = 8 for a system (w/unknown or hybrid approach) \n");
+         hypre_printf("      19=use non-systems interp = 0 for a system (w/unknown or hybrid approach) \n");
+
+
+         hypre_printf("      10=classical block interpolation for nodal systems AMG\n");
+         hypre_printf("      11=classical block interpolation with diagonal blocks for nodal systems AMG\n");
+         hypre_printf("      20=same as 10, but don't add weak connect. to diag \n");
+         hypre_printf("      21=same as 11, but don't add weak connect. to diag \n");
+         hypre_printf("      22=classical block interpolation w/Ruge's variant for nodal systems AMG \n");
+         hypre_printf("      23=same as 22, but use row sums for diag scaling matrices,for nodal systems AMG \n");
+         hypre_printf("      24=direct block interpolation for nodal systems AMG\n");
 
-      hypre_printf("      10=classical block interpolation for nodal systems AMG\n");
-      hypre_printf("      11=classical block interpolation with diagonal blocks for nodal systems AMG\n");
-      hypre_printf("      20=same as 10, but don't add weak connect. to diag \n");
-      hypre_printf("      21=same as 11, but don't add weak connect. to diag \n");
-      hypre_printf("      22=classical block interpolation w/Ruge's variant for nodal systems AMG \n");
-      hypre_printf("      23=same as 22, but use row sums for diag scaling matrices,for nodal systems AMG \n");
-      hypre_printf("      24=direct block interpolation for nodal systems AMG\n");
 
 
-     
-      hypre_printf("\n");
-      hypre_printf("  -rlx  <val>            : relaxation type\n");
-      hypre_printf("       0=Weighted Jacobi  \n");
-      hypre_printf("       1=Gauss-Seidel (very slow!)  \n");
-      hypre_printf("       3=Hybrid Gauss-Seidel  \n");
-      hypre_printf("       4=Hybrid backward Gauss-Seidel  \n");
-      hypre_printf("       6=Hybrid symmetric Gauss-Seidel  \n");
-      hypre_printf("       8= symmetric L1-Gauss-Seidel  \n");
-      hypre_printf("       13= forward L1-Gauss-Seidel  \n");
-      hypre_printf("       14= backward L1-Gauss-Seidel  \n");
-      hypre_printf("       15=CG  \n");
-      hypre_printf("       16=Chebyshev  \n");
-      hypre_printf("       17=FCF-Jacobi  \n");
-      hypre_printf("       18=L1-Jacobi (may be used with -CF) \n");
-      hypre_printf("       9=Gauss elimination (use for coarsest grid only)  \n");
-      hypre_printf("       99=Gauss elimination with pivoting (use for coarsest grid only)  \n");
-      hypre_printf("       20= Nodal Weighted Jacobi (for systems only) \n");
-      hypre_printf("       23= Nodal Hybrid Jacobi/Gauss-Seidel (for systems only) \n");
-      hypre_printf("       26= Nodal Hybrid Symmetric Gauss-Seidel  (for systems only)\n");
-      hypre_printf("       29= Nodal Gauss elimination (use for coarsest grid only)  \n");
-      hypre_printf("  -rlx_coarse  <val>       : set relaxation type for coarsest grid\n");
-      hypre_printf("  -rlx_down    <val>       : set relaxation type for down cycle\n");
-      hypre_printf("  -rlx_up      <val>       : set relaxation type for up cycle\n");
-      hypre_printf("  -cheby_order  <val> : set order (1-4) for Chebyshev poly. smoother (default is 2)\n");
-      hypre_printf("  -cheby_fraction <val> : fraction of the spectrum for Chebyshev poly. smoother (default is .3)\n");
-      hypre_printf("  -nodal  <val>            : nodal system type\n");
-      hypre_printf("       0 = Unknown approach \n");
-      hypre_printf("       1 = Frobenius norm  \n");
-      hypre_printf("       2 = Sum of Abs.value of elements  \n");
-      hypre_printf("       3 = Largest magnitude element (includes its sign)  \n");
-      hypre_printf("       4 = Inf. norm  \n");
-      hypre_printf("       5 = One norm  (note: use with block version only) \n");
-      hypre_printf("       6 = Sum of all elements in block  \n");
-      hypre_printf("  -nodal_diag <val>        :how to treat diag elements\n");
-      hypre_printf("       0 = no special treatment \n");
-      hypre_printf("       1 = make diag = neg.sum of the off_diag  \n");
-      hypre_printf("       2 = make diag = neg. of diag \n");
-      hypre_printf("  -ns <val>              : Use <val> sweeps on each level\n");
-      hypre_printf("                           (default C/F down, F/C up, F/C fine\n");
-      hypre_printf("  -ns_coarse  <val>       : set no. of sweeps for coarsest grid\n");
-      hypre_printf("  -ns_down    <val>       : set no. of sweeps for down cycle\n");
-      hypre_printf("  -ns_up      <val>       : set no. of sweeps for up cycle\n");
-      hypre_printf("\n"); 
-      hypre_printf("  -mu   <val>            : set AMG cycles (1=V, 2=W, etc.)\n"); 
-      hypre_printf("  -th   <val>            : set AMG threshold Theta = val \n");
-      hypre_printf("  -tr   <val>            : set AMG interpolation truncation factor = val \n");
-      hypre_printf("  -Pmx  <val>            : set maximal no. of elmts per row for AMG interpolation (default: 4)\n");
-      hypre_printf("  -jtr  <val>            : set truncation threshold for Jacobi interpolation = val \n");
-      hypre_printf("  -Ssw  <val>            : set S-commpkg-switch = val \n");
-      hypre_printf("  -mxrs <val>            : set AMG maximum row sum threshold for dependency weakening \n");
-      hypre_printf("  -nf <val>              : set number of functions for systems AMG\n");
-      hypre_printf("  -numsamp <val>         : set number of sample vectors for GSMG\n");
-    
-      hypre_printf("  -postinterptype <val>  : invokes <val> no. of Jacobi interpolation steps after main interpolation\n");
-      hypre_printf("\n");
-      hypre_printf("  -cgcitr <val>          : set maximal number of coarsening iterations for CGC\n");
-      hypre_printf("  -solver_type <val>     : sets solver within Hybrid solver\n");
-      hypre_printf("                         : 1  PCG  (default)\n");
-      hypre_printf("                         : 2  GMRES\n");
-      hypre_printf("                         : 3  BiCGSTAB\n");
-     
-      hypre_printf("  -w   <val>             : set Jacobi relax weight = val\n");
-      hypre_printf("  -k   <val>             : dimension Krylov space for GMRES\n");
-      hypre_printf("  -aug   <val>           : number of augmentation vectors for LGMRES (-k indicates total approx space size)\n");
-
-      hypre_printf("  -mxl  <val>            : maximum number of levels (AMG, ParaSAILS)\n");
-      hypre_printf("  -tol  <val>            : set solver convergence tolerance = val\n");
-      hypre_printf("  -atol  <val>           : set solver absolute convergence tolerance = val\n");
-      hypre_printf("  -max_iter  <val>       : set max iterations\n");
-      hypre_printf("  -mg_max_iter  <val>    : set max iterations for mg solvers\n");
-      hypre_printf("  -agg_nl  <val>         : set number of aggressive coarsening levels (default:0)\n");
-      hypre_printf("  -np  <val>             : set number of paths of length 2 for aggr. coarsening\n");
-      hypre_printf("\n");
-      hypre_printf("  -sai_th   <val>        : set ParaSAILS threshold = val \n");
-      hypre_printf("  -sai_filt <val>        : set ParaSAILS filter = val \n");
-      hypre_printf("\n");
-      hypre_printf("  -level   <val>         : set k in ILU(k) for Euclid \n");
-      hypre_printf("  -bj <val>              : enable block Jacobi ILU for Euclid \n");
-      hypre_printf("  -ilut <val>            : set drop tolerance for ILUT in Euclid\n");
-      hypre_printf("                           Note ILUT is sequential only!\n");
-      hypre_printf("  -sparseA <val>         : set drop tolerance in ILU(k) for Euclid \n");
-      hypre_printf("  -rowScale <val>        : enable row scaling in Euclid \n");
-      hypre_printf("\n");  
-      hypre_printf("  -drop_tol  <val>       : set threshold for dropping in PILUT\n");
-      hypre_printf("  -nonzeros_to_keep <val>: number of nonzeros in each row to keep\n");
-      hypre_printf("\n");  
-      hypre_printf("  -iout <val>            : set output flag\n");
-      hypre_printf("       0=no output    1=matrix stats\n"); 
-      hypre_printf("       2=cycle stats  3=matrix & cycle stats\n"); 
-      hypre_printf("\n");  
-      hypre_printf("  -dbg <val>             : set debug flag\n");
-      hypre_printf("       0=no debugging\n       1=internal timing\n       2=interpolation truncation\n       3=more detailed timing in coarsening routine\n");
-      hypre_printf("\n");
-      hypre_printf("  -print                 : print out the system\n");
-      hypre_printf("\n");
-      /* begin lobpcg */
+         hypre_printf("\n");
+         hypre_printf("  -rlx  <val>            : relaxation type\n");
+         hypre_printf("       0=Weighted Jacobi  \n");
+         hypre_printf("       1=Gauss-Seidel (very slow!)  \n");
+         hypre_printf("       3=Hybrid Gauss-Seidel  \n");
+         hypre_printf("       4=Hybrid backward Gauss-Seidel  \n");
+         hypre_printf("       6=Hybrid symmetric Gauss-Seidel  \n");
+         hypre_printf("       8= symmetric L1-Gauss-Seidel  \n");
+         hypre_printf("       13= forward L1-Gauss-Seidel  \n");
+         hypre_printf("       14= backward L1-Gauss-Seidel  \n");
+         hypre_printf("       15=CG  \n");
+         hypre_printf("       16=Chebyshev  \n");
+         hypre_printf("       17=FCF-Jacobi  \n");
+         hypre_printf("       18=L1-Jacobi (may be used with -CF) \n");
+         hypre_printf("       9=Gauss elimination (use for coarsest grid only)  \n");
+         hypre_printf("       99=Gauss elimination with pivoting (use for coarsest grid only)  \n");
+         hypre_printf("       20= Nodal Weighted Jacobi (for systems only) \n");
+         hypre_printf("       23= Nodal Hybrid Jacobi/Gauss-Seidel (for systems only) \n");
+         hypre_printf("       26= Nodal Hybrid Symmetric Gauss-Seidel  (for systems only)\n");
+         hypre_printf("       29= Nodal Gauss elimination (use for coarsest grid only)  \n");
+         hypre_printf("  -rlx_coarse  <val>       : set relaxation type for coarsest grid\n");
+         hypre_printf("  -rlx_down    <val>       : set relaxation type for down cycle\n");
+         hypre_printf("  -rlx_up      <val>       : set relaxation type for up cycle\n");
+         hypre_printf("  -cheby_order  <val> : set order (1-4) for Chebyshev poly. smoother (default is 2)\n");
+         hypre_printf("  -cheby_fraction <val> : fraction of the spectrum for Chebyshev poly. smoother (default is .3)\n");
+         hypre_printf("  -nodal  <val>            : nodal system type\n");
+         hypre_printf("       0 = Unknown approach \n");
+         hypre_printf("       1 = Frobenius norm  \n");
+         hypre_printf("       2 = Sum of Abs.value of elements  \n");
+         hypre_printf("       3 = Largest magnitude element (includes its sign)  \n");
+         hypre_printf("       4 = Inf. norm  \n");
+         hypre_printf("       5 = One norm  (note: use with block version only) \n");
+         hypre_printf("       6 = Sum of all elements in block  \n");
+         hypre_printf("  -nodal_diag <val>        :how to treat diag elements\n");
+         hypre_printf("       0 = no special treatment \n");
+         hypre_printf("       1 = make diag = neg.sum of the off_diag  \n");
+         hypre_printf("       2 = make diag = neg. of diag \n");
+         hypre_printf("  -ns <val>              : Use <val> sweeps on each level\n");
+         hypre_printf("                           (default C/F down, F/C up, F/C fine\n");
+         hypre_printf("  -ns_coarse  <val>       : set no. of sweeps for coarsest grid\n");
+         /*hypre_printf("  -ns_down    <val>       : set no. of sweeps for down cycle\n");
+           hypre_printf("  -ns_up      <val>       : set no. of sweeps for up cycle\n");*/
+         hypre_printf("\n"); 
+         hypre_printf("  -mu   <val>            : set AMG cycles (1=V, 2=W, etc.)\n"); 
+         hypre_printf("  -th   <val>            : set AMG threshold Theta = val \n");
+         hypre_printf("  -tr   <val>            : set AMG interpolation truncation factor = val \n");
+         hypre_printf("  -Pmx  <val>            : set maximal no. of elmts per row for AMG interpolation (default: 4)\n");
+         hypre_printf("  -jtr  <val>            : set truncation threshold for Jacobi interpolation = val \n");
+         hypre_printf("  -Ssw  <val>            : set S-commpkg-switch = val \n");
+         hypre_printf("  -mxrs <val>            : set AMG maximum row sum threshold for dependency weakening \n");
+         hypre_printf("  -nf <val>              : set number of functions for systems AMG\n");
+         hypre_printf("  -numsamp <val>         : set number of sample vectors for GSMG\n");
+
+         hypre_printf("  -postinterptype <val>  : invokes <val> no. of Jacobi interpolation steps after main interpolation\n");
+         hypre_printf("\n");
+         hypre_printf("  -cgcitr <val>          : set maximal number of coarsening iterations for CGC\n");
+         hypre_printf("  -solver_type <val>     : sets solver within Hybrid solver\n");
+         hypre_printf("                         : 1  PCG  (default)\n");
+         hypre_printf("                         : 2  GMRES\n");
+         hypre_printf("                         : 3  BiCGSTAB\n");
+
+         hypre_printf("  -w   <val>             : set Jacobi relax weight = val\n");
+         hypre_printf("  -k   <val>             : dimension Krylov space for GMRES\n");
+         hypre_printf("  -aug   <val>           : number of augmentation vectors for LGMRES (-k indicates total approx space size)\n");
+
+         hypre_printf("  -mxl  <val>            : maximum number of levels (AMG, ParaSAILS)\n");
+         hypre_printf("  -tol  <val>            : set solver convergence tolerance = val\n");
+         hypre_printf("  -atol  <val>           : set solver absolute convergence tolerance = val\n");
+         hypre_printf("  -max_iter  <val>       : set max iterations\n");
+         hypre_printf("  -mg_max_iter  <val>    : set max iterations for mg solvers\n");
+         hypre_printf("  -agg_nl  <val>         : set number of aggressive coarsening levels (default:0)\n");
+         hypre_printf("  -np  <val>             : set number of paths of length 2 for aggr. coarsening\n");
+         hypre_printf("\n");
+         hypre_printf("  -sai_th   <val>        : set ParaSAILS threshold = val \n");
+         hypre_printf("  -sai_filt <val>        : set ParaSAILS filter = val \n");
+         hypre_printf("\n");
+         hypre_printf("  -level   <val>         : set k in ILU(k) for Euclid \n");
+         hypre_printf("  -bj <val>              : enable block Jacobi ILU for Euclid \n");
+         hypre_printf("  -ilut <val>            : set drop tolerance for ILUT in Euclid\n");
+         hypre_printf("                           Note ILUT is sequential only!\n");
+         hypre_printf("  -sparseA <val>         : set drop tolerance in ILU(k) for Euclid \n");
+         hypre_printf("  -rowScale <val>        : enable row scaling in Euclid \n");
+         hypre_printf("\n");  
+         hypre_printf("  -drop_tol  <val>       : set threshold for dropping in PILUT\n");
+         hypre_printf("  -nonzeros_to_keep <val>: number of nonzeros in each row to keep\n");
+         hypre_printf("\n");  
+         hypre_printf("  -iout <val>            : set output flag\n");
+         hypre_printf("       0=no output    1=matrix stats\n"); 
+         hypre_printf("       2=cycle stats  3=matrix & cycle stats\n"); 
+         hypre_printf("\n");  
+         hypre_printf("  -dbg <val>             : set debug flag\n");
+         hypre_printf("       0=no debugging\n       1=internal timing\n       2=interpolation truncation\n       3=more detailed timing in coarsening routine\n");
+         hypre_printf("\n");
+         hypre_printf("  -print                 : print out the system\n");
+         hypre_printf("\n");
+         /* begin lobpcg */
 
-      hypre_printf("LOBPCG options:\n");
-      hypre_printf("\n");
-      hypre_printf("  -lobpcg                 : run LOBPCG instead of PCG\n");
-      hypre_printf("\n");
-      hypre_printf("  -gen                    : solve generalized EVP with B = Laplacian\n");
-      hypre_printf("\n");
-      hypre_printf("  -con                    : solve constrained EVP using 'vectors.*.*'\n");
-      hypre_printf("                            as constraints (see -vout 1 below)\n");
-      hypre_printf("\n");
-      hypre_printf("  -solver none            : no HYPRE preconditioner is used\n");
-      hypre_printf("\n");
-      hypre_printf("  -itr <val>              : maximal number of LOBPCG iterations\n");
-      hypre_printf("                            (default 100);\n");
-      hypre_printf("\n");
-      hypre_printf("  -vrand <val>            : compute <val> eigenpairs using random\n");
-      hypre_printf("                            initial vectors (default 1)\n");
-      hypre_printf("\n");
-      hypre_printf("  -seed <val>             : use <val> as the seed for the random\n");
-      hypre_printf("                            number generator(default seed is based\n");
-      hypre_printf("                            on the time of the run)\n");
-      hypre_printf("\n");
-      hypre_printf("  -vfromfile              : read initial vectors from files\n");
-      hypre_printf("                            vectors.i.j where i is vector number\n");
-      hypre_printf("                            and j is processor number\n");
-      hypre_printf("\n");
-      hypre_printf("  -orthchk                : check eigenvectors for orthonormality\n");
-      hypre_printf("\n");
-      hypre_printf("  -verb <val>             : verbosity level\n");
-      hypre_printf("  -verb 0                 : no print\n");
-      hypre_printf("  -verb 1                 : print initial eigenvalues and residuals,\n");
-      hypre_printf("                            the iteration number, the number of\n");
-      hypre_printf("                            non-convergent eigenpairs and final\n");
-      hypre_printf("                            eigenvalues and residuals (default)\n");
-      hypre_printf("  -verb 2                 : print eigenvalues and residuals on each\n");
-      hypre_printf("                            iteration\n");
-      hypre_printf("\n");
-      hypre_printf("  -pcgitr <val>           : maximal number of inner PCG iterations\n");
-      hypre_printf("                            for preconditioning (default 1);\n");
-      hypre_printf("                            if <val> = 0 then the preconditioner\n");
-      hypre_printf("                            is applied directly\n");
-      hypre_printf("\n");
-      hypre_printf("  -pcgtol <val>           : residual tolerance for inner iterations\n");
-      hypre_printf("                            (default 0.01)\n");
-      hypre_printf("\n");
-      hypre_printf("  -vout <val>             : file output level\n");
-      hypre_printf("  -vout 0                 : no files created (default)\n");
-      hypre_printf("  -vout 1                 : write eigenvalues to values.txt, residuals\n");
-      hypre_printf("                            to residuals.txt and eigenvectors to \n");
-      hypre_printf("                            vectors.i.j where i is vector number\n");
-      hypre_printf("                            and j is processor number\n");
-      hypre_printf("  -vout 2                 : in addition to the above, write the\n");
-      hypre_printf("                            eigenvalues history (the matrix whose\n");
-      hypre_printf("                            i-th column contains eigenvalues at\n");
-      hypre_printf("                            (i+1)-th iteration) to val_hist.txt and\n");
-      hypre_printf("                            residuals history to res_hist.txt\n");
-      hypre_printf("\nNOTE: in this test driver LOBPCG only works with solvers 1, 2, 8, 12, 14 and 43\n");
-      hypre_printf("\ndefault solver is 1\n");
-      hypre_printf("\n");
+         hypre_printf("LOBPCG options:\n");
+         hypre_printf("\n");
+         hypre_printf("  -lobpcg                 : run LOBPCG instead of PCG\n");
+         hypre_printf("\n");
+         hypre_printf("  -gen                    : solve generalized EVP with B = Laplacian\n");
+         hypre_printf("\n");
+         hypre_printf("  -con                    : solve constrained EVP using 'vectors.*.*'\n");
+         hypre_printf("                            as constraints (see -vout 1 below)\n");
+         hypre_printf("\n");
+         hypre_printf("  -solver none            : no HYPRE preconditioner is used\n");
+         hypre_printf("\n");
+         hypre_printf("  -itr <val>              : maximal number of LOBPCG iterations\n");
+         hypre_printf("                            (default 100);\n");
+         hypre_printf("\n");
+         hypre_printf("  -vrand <val>            : compute <val> eigenpairs using random\n");
+         hypre_printf("                            initial vectors (default 1)\n");
+         hypre_printf("\n");
+         hypre_printf("  -seed <val>             : use <val> as the seed for the random\n");
+         hypre_printf("                            number generator(default seed is based\n");
+         hypre_printf("                            on the time of the run)\n");
+         hypre_printf("\n");
+         hypre_printf("  -vfromfile              : read initial vectors from files\n");
+         hypre_printf("                            vectors.i.j where i is vector number\n");
+         hypre_printf("                            and j is processor number\n");
+         hypre_printf("\n");
+         hypre_printf("  -orthchk                : check eigenvectors for orthonormality\n");
+         hypre_printf("\n");
+         hypre_printf("  -verb <val>             : verbosity level\n");
+         hypre_printf("  -verb 0                 : no print\n");
+         hypre_printf("  -verb 1                 : print initial eigenvalues and residuals,\n");
+         hypre_printf("                            the iteration number, the number of\n");
+         hypre_printf("                            non-convergent eigenpairs and final\n");
+         hypre_printf("                            eigenvalues and residuals (default)\n");
+         hypre_printf("  -verb 2                 : print eigenvalues and residuals on each\n");
+         hypre_printf("                            iteration\n");
+         hypre_printf("\n");
+         hypre_printf("  -pcgitr <val>           : maximal number of inner PCG iterations\n");
+         hypre_printf("                            for preconditioning (default 1);\n");
+         hypre_printf("                            if <val> = 0 then the preconditioner\n");
+         hypre_printf("                            is applied directly\n");
+         hypre_printf("\n");
+         hypre_printf("  -pcgtol <val>           : residual tolerance for inner iterations\n");
+         hypre_printf("                            (default 0.01)\n");
+         hypre_printf("\n");
+         hypre_printf("  -vout <val>             : file output level\n");
+         hypre_printf("  -vout 0                 : no files created (default)\n");
+         hypre_printf("  -vout 1                 : write eigenvalues to values.txt, residuals\n");
+         hypre_printf("                            to residuals.txt and eigenvectors to \n");
+         hypre_printf("                            vectors.i.j where i is vector number\n");
+         hypre_printf("                            and j is processor number\n");
+         hypre_printf("  -vout 2                 : in addition to the above, write the\n");
+         hypre_printf("                            eigenvalues history (the matrix whose\n");
+         hypre_printf("                            i-th column contains eigenvalues at\n");
+         hypre_printf("                            (i+1)-th iteration) to val_hist.txt and\n");
+         hypre_printf("                            residuals history to res_hist.txt\n");
+         hypre_printf("\nNOTE: in this test driver LOBPCG only works with solvers 1, 2, 8, 12, 14 and 43\n");
+         hypre_printf("\ndefault solver is 1\n");
+         hypre_printf("\n");
 
-      /* end lobpcg */
+         /* end lobpcg */
 
-      hypre_printf("  -plot_grids            : print out information for plotting the grids\n");
-      hypre_printf("  -plot_file_name <val>  : file name for plotting output\n");
-      hypre_printf("\n");
-      hypre_printf("  -smtype <val>      :smooth type\n");
-      hypre_printf("  -smlv <val>        :smooth num levels\n");
-      hypre_printf("  -ov <val>          :over lap:\n");
-      hypre_printf("  -dom <val>         :domain type\n");
-      hypre_printf("  -use_ns            : use non-symm schwarz smoother\n");
-      hypre_printf("  -var <val>         : schwarz smoother variant (0-3) \n");
-      hypre_printf("  -blk_sm <val>      : same as '-smtype 6 -ov 0 -dom 1 -smlv <val>'\n");
-      hypre_printf("  -nongalerk_tol <val> <list>    : specify the NonGalerkin drop tolerance\n");
-      hypre_printf("                                   and list contains the values, where last value\n");
-      hypre_printf("                                   in list is repeated if val < num_levels in AMG\n");
-      exit(1);
+         hypre_printf("  -plot_grids            : print out information for plotting the grids\n");
+         hypre_printf("  -plot_file_name <val>  : file name for plotting output\n");
+         hypre_printf("\n");
+         hypre_printf("  -smtype <val>      :smooth type\n");
+         hypre_printf("  -smlv <val>        :smooth num levels\n");
+         hypre_printf("  -ov <val>          :over lap:\n");
+         hypre_printf("  -dom <val>         :domain type\n");
+         hypre_printf("  -use_ns            : use non-symm schwarz smoother\n");
+         hypre_printf("  -var <val>         : schwarz smoother variant (0-3) \n");
+         hypre_printf("  -blk_sm <val>      : same as '-smtype 6 -ov 0 -dom 1 -smlv <val>'\n");
+         hypre_printf("  -nongalerk_tol <val> <list>    : specify the NonGalerkin drop tolerance\n");
+         hypre_printf("                                   and list contains the values, where last value\n");
+         hypre_printf("                                   in list is repeated if val < num_levels in AMG\n");
+      }
+
+      goto final;
    }
 
    /*-----------------------------------------------------------
@@ -1636,7 +1672,8 @@ main( hypre_int argc,
    else if ( build_matrix_type == 6 )
    {
       BuildParVarDifConv(argc, argv, build_matrix_arg_index, &parcsr_A, &b);
-      /*HYPRE_ParCSRMatrixPrint(parcsr_A,"mat100");*/
+      build_rhs_type      = 6;
+      build_src_type      = 5;
    }
    else if ( build_matrix_type == 7 )
    {
@@ -2163,6 +2200,10 @@ main( hypre_int argc,
       ierr = HYPRE_IJVectorGetObject( ij_x, &object );
       x = (HYPRE_ParVector) object;
    }
+   else if ( build_rhs_type == 6 )
+   {
+      ij_b = NULL;
+   }
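
For build_rhs_type == 6 the right-hand side exists only as the ParVector b
returned by the matrix generator, so ij_b is deliberately left NULL and every
later use of the IJ handle has to be guarded. A minimal sketch of that guard,
the same pattern the -print hunk further down applies:

   if (ij_b)
   {
      HYPRE_IJVectorPrint(ij_b, "IJ.out.b");
   }
   else if (b)
   {
      HYPRE_ParVectorPrint(b, "ParVec.out.b");
   }
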
 
    if ( build_src_type == 0 )
    {
@@ -2328,6 +2369,33 @@ main( hypre_int argc,
       ierr = HYPRE_IJVectorGetObject( ij_x, &object );
       x = (HYPRE_ParVector) object;
    }
+   else if ( build_src_type == 5 )
+   {
+      if (myid == 0)
+      {
+         hypre_printf("  Initial guess is random \n");
+      }
+
+      /* Initial guess */
+      HYPRE_IJVectorCreate(hypre_MPI_COMM_WORLD, first_local_col, last_local_col, &ij_x);
+      HYPRE_IJVectorSetObjectType(ij_x, HYPRE_PARCSR);
+      HYPRE_IJVectorInitialize(ij_x);
+
+      /* For backward Euler the previous backward Euler iterate (assumed
+         random in 0 - 1 here) is usually used as the initial guess */
+      values = hypre_CTAlloc(HYPRE_Real, local_num_cols);
+      /* hypre_SeedRand(myid+2747); */
+      hypre_SeedRand(myid);
+      for (i = 0; i < local_num_cols; i++)
+      {
+         values[i] = hypre_Rand();
+      }
+      HYPRE_IJVectorSetValues(ij_x, local_num_cols, NULL, values);
+      hypre_TFree(values);
+
+      ierr = HYPRE_IJVectorGetObject( ij_x, &object );
+      x = (HYPRE_ParVector) object;
+   }
 
    hypre_EndTiming(time_index);
    hypre_PrintTiming("IJ Vector Setup", hypre_MPI_COMM_WORLD);
@@ -2359,7 +2427,14 @@ main( hypre_int argc,
    if (print_system)
    {
       HYPRE_IJMatrixPrint(ij_A, "IJ.out.A");
-      HYPRE_IJVectorPrint(ij_b, "IJ.out.b");
+      if (ij_b)
+      {
+         HYPRE_IJVectorPrint(ij_b, "IJ.out.b");
+      }
+      else if (b)
+      {
+         HYPRE_ParVectorPrint(b, "ParVec.out.b");
+      }
       HYPRE_IJVectorPrint(ij_x, "IJ.out.x0");
 
       /* HYPRE_ParCSRMatrixPrint( parcsr_A, "new_mat.A" );*/
@@ -2497,6 +2572,9 @@ main( hypre_int argc,
       HYPRE_BoomerAMGSetAddRelaxWt(amg_solver, add_relax_wt);
       HYPRE_BoomerAMGSetChebyOrder(amg_solver, cheby_order);
       HYPRE_BoomerAMGSetChebyFraction(amg_solver, cheby_fraction);
+      HYPRE_BoomerAMGSetChebyEigEst(amg_solver, cheby_eig_est);
+      HYPRE_BoomerAMGSetChebyVariant(amg_solver, cheby_variant);
+      HYPRE_BoomerAMGSetChebyScale(amg_solver, cheby_scale);
       HYPRE_BoomerAMGSetRelaxOrder(amg_solver, relax_order);
       HYPRE_BoomerAMGSetRelaxWt(amg_solver, relax_wt);
       HYPRE_BoomerAMGSetOuterWt(amg_solver, outer_wt);
@@ -2536,6 +2614,7 @@ main( hypre_int argc,
       HYPRE_BoomerAMGSetAdditive(amg_solver, additive);
       HYPRE_BoomerAMGSetMultAdditive(amg_solver, mult_add);
       HYPRE_BoomerAMGSetSimple(amg_solver, simple);
+      HYPRE_BoomerAMGSetAddLastLvl(amg_solver, add_last_lvl);
       HYPRE_BoomerAMGSetMultAddPMaxElmts(amg_solver, add_P_max_elmts);
       HYPRE_BoomerAMGSetMultAddTruncFactor(amg_solver, add_trunc_factor);
 
@@ -2653,6 +2732,9 @@ main( hypre_int argc,
       HYPRE_BoomerAMGSetAddRelaxWt(amg_solver, add_relax_wt);
       HYPRE_BoomerAMGSetChebyOrder(amg_solver, cheby_order);
       HYPRE_BoomerAMGSetChebyFraction(amg_solver, cheby_fraction);
+      HYPRE_BoomerAMGSetChebyEigEst(amg_solver, cheby_eig_est);
+      HYPRE_BoomerAMGSetChebyVariant(amg_solver, cheby_variant);
+      HYPRE_BoomerAMGSetChebyScale(amg_solver, cheby_scale);
       HYPRE_BoomerAMGSetRelaxOrder(amg_solver, relax_order);
       HYPRE_BoomerAMGSetRelaxWt(amg_solver, relax_wt);
       HYPRE_BoomerAMGSetOuterWt(amg_solver, outer_wt);
@@ -2690,6 +2772,7 @@ main( hypre_int argc,
       HYPRE_BoomerAMGSetAdditive(amg_solver, additive);
       HYPRE_BoomerAMGSetMultAdditive(amg_solver, mult_add);
       HYPRE_BoomerAMGSetSimple(amg_solver, simple);
+      HYPRE_BoomerAMGSetAddLastLvl(amg_solver, add_last_lvl);
       HYPRE_BoomerAMGSetMultAddPMaxElmts(amg_solver, add_P_max_elmts);
       HYPRE_BoomerAMGSetMultAddTruncFactor(amg_solver, add_trunc_factor);
       HYPRE_BoomerAMGSetRAP2(amg_solver, rap2);
@@ -2817,6 +2900,9 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -2855,6 +2941,7 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -2972,6 +3059,9 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
          if (level_w > -1)
@@ -3008,6 +3098,7 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -3064,7 +3155,7 @@ main( hypre_int argc,
 
       HYPRE_PCGSetup(pcg_solver, (HYPRE_Matrix)parcsr_A, 
                      (HYPRE_Vector)b, (HYPRE_Vector)x);
- 
+
       hypre_EndTiming(time_index);
       hypre_PrintTiming("Setup phase times", hypre_MPI_COMM_WORLD);
       hypre_FinalizeTiming(time_index);
@@ -4060,6 +4151,9 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -4098,6 +4192,7 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -4203,6 +4298,9 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -4241,6 +4339,7 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -4423,6 +4522,9 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -4461,6 +4563,7 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -4596,6 +4699,9 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -4634,6 +4740,7 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -4774,6 +4881,9 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -4813,6 +4923,7 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -4996,6 +5107,9 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAddRelaxWt(pcg_precond, add_relax_wt);
          HYPRE_BoomerAMGSetChebyOrder(pcg_precond, cheby_order);
          HYPRE_BoomerAMGSetChebyFraction(pcg_precond, cheby_fraction);
+         HYPRE_BoomerAMGSetChebyEigEst(pcg_precond, cheby_eig_est);
+         HYPRE_BoomerAMGSetChebyVariant(pcg_precond, cheby_variant);
+         HYPRE_BoomerAMGSetChebyScale(pcg_precond, cheby_scale);
          HYPRE_BoomerAMGSetRelaxOrder(pcg_precond, relax_order);
          HYPRE_BoomerAMGSetRelaxWt(pcg_precond, relax_wt);
          HYPRE_BoomerAMGSetOuterWt(pcg_precond, outer_wt);
@@ -5027,6 +5141,7 @@ main( hypre_int argc,
          HYPRE_BoomerAMGSetAdditive(pcg_precond, additive);
          HYPRE_BoomerAMGSetMultAdditive(pcg_precond, mult_add);
          HYPRE_BoomerAMGSetSimple(pcg_precond, simple);
+         HYPRE_BoomerAMGSetAddLastLvl(pcg_precond, add_last_lvl);
          HYPRE_BoomerAMGSetMultAddPMaxElmts(pcg_precond, add_P_max_elmts);
          HYPRE_BoomerAMGSetMultAddTruncFactor(pcg_precond, add_trunc_factor);
          HYPRE_BoomerAMGSetRAP2(pcg_precond, rap2);
@@ -5117,8 +5232,8 @@ main( hypre_int argc,
     *-----------------------------------------------------------*/
 
    /* RDF: Why is this here? */
-   if (!(build_rhs_type ==1 || build_rhs_type ==7))
-      HYPRE_IJVectorGetObjectType(ij_b, &j);
+   /*if (!(build_rhs_type ==1 || build_rhs_type ==7))
+      HYPRE_IJVectorGetObjectType(ij_b, &j);*/
 
    if (print_system)
    {
@@ -5133,7 +5248,7 @@ main( hypre_int argc,
    else HYPRE_ParCSRMatrixDestroy(parcsr_A);
 
    /* for build_rhs_type = 1 or 7, we did not create ij_b  - just b*/
-   if (build_rhs_type ==1 || build_rhs_type ==7)
+   if (build_rhs_type == 1 || build_rhs_type == 7 || build_rhs_type == 6)
       HYPRE_ParVectorDestroy(b);
    else
       HYPRE_IJVectorDestroy(ij_b);
@@ -5152,7 +5267,9 @@ main( hypre_int argc,
 /*
   hypre_FinalizeMemoryDebug();
 */
+final: 
 
+   hypre_GPUFinalize();
    hypre_MPI_Finalize();
 
    return (0);
@@ -5762,6 +5879,17 @@ BuildParLaplacian( HYPRE_Int                  argc,
 }
 
 /*----------------------------------------------------------------------
+ * returns the sign of a real number
+ *  1 : positive
+ *  0 : zero
+ * -1 : negative
+ *----------------------------------------------------------------------*/
+static inline HYPRE_Int sign_double(HYPRE_Real a)
+{
+   return ( (0.0 < a) - (0.0 > a) );
+}
+
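
The sign_double helper added above is branch-free: each comparison contributes
0 or 1, so their difference is exactly the sign. An illustrative sanity check
(not part of the commit):

   hypre_assert(sign_double(-2.5) == -1);   /* 0 - 1 */
   hypre_assert(sign_double( 0.0) ==  0);   /* 0 - 0 */
   hypre_assert(sign_double( 3.0) ==  1);   /* 1 - 0 */
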
+/*----------------------------------------------------------------------
  * Build standard 7-point convection-diffusion operator 
  * Parameters given in command line.
  * Operator:
@@ -5774,18 +5902,19 @@ HYPRE_Int
 BuildParDifConv( HYPRE_Int                  argc,
                  char                *argv[],
                  HYPRE_Int                  arg_index,
-                 HYPRE_ParCSRMatrix  *A_ptr     )
+                 HYPRE_ParCSRMatrix  *A_ptr)
 {
-   HYPRE_Int                 nx, ny, nz;
-   HYPRE_Int                 P, Q, R;
+   HYPRE_Int           nx, ny, nz;
+   HYPRE_Int           P, Q, R;
    HYPRE_Real          cx, cy, cz;
-   HYPRE_Real          ax, ay, az;
+   HYPRE_Real          ax, ay, az;
    HYPRE_Real          hinx,hiny,hinz;
+   HYPRE_Int           sign_prod, atype;
 
    HYPRE_ParCSRMatrix  A;
 
-   HYPRE_Int                 num_procs, myid;
-   HYPRE_Int                 p, q, r;
+   HYPRE_Int           num_procs, myid;
+   HYPRE_Int           p, q, r;
    HYPRE_Real         *values;
 
    /*-----------------------------------------------------------
@@ -5815,6 +5944,8 @@ BuildParDifConv( HYPRE_Int                  argc,
    ay = 1.;
    az = 1.;
 
+   atype = 0;
+
    /*-----------------------------------------------------------
     * Parse command line
     *-----------------------------------------------------------*/
@@ -5849,6 +5980,11 @@ BuildParDifConv( HYPRE_Int                  argc,
          ay = atof(argv[arg_index++]);
          az = atof(argv[arg_index++]);
       }
+      else if ( strcmp(argv[arg_index], "-atype") == 0 )
+      {
+         arg_index++;
+         atype = atoi(argv[arg_index++]);
+      }
       else
       {
          arg_index++;
@@ -5895,28 +6031,146 @@ BuildParDifConv( HYPRE_Int                  argc,
    /*-----------------------------------------------------------
     * Generate the matrix 
     *-----------------------------------------------------------*/
- 
+   /* values[7]:
+    *    [0]: center
+    *    [1]: X-
+    *    [2]: Y-
+    *    [3]: Z-
+    *    [4]: X+
+    *    [5]: Y+
+    *    [6]: Z+
+    */    
    values = hypre_CTAlloc(HYPRE_Real, 7);
 
-   values[1] = -cx/(hinx*hinx);
-   values[2] = -cy/(hiny*hiny);
-   values[3] = -cz/(hinz*hinz);
-   values[4] = -cx/(hinx*hinx) + ax/hinx;
-   values[5] = -cy/(hiny*hiny) + ay/hiny;
-   values[6] = -cz/(hinz*hinz) + az/hinz;
-
    values[0] = 0.;
-   if (nx > 1)
+
+   if (0 == atype) /* forward scheme for conv */
    {
-      values[0] += 2.0*cx/(hinx*hinx) - 1.*ax/hinx;
+      values[1] = -cx/(hinx*hinx);
+      values[2] = -cy/(hiny*hiny);
+      values[3] = -cz/(hinz*hinz);
+      values[4] = -cx/(hinx*hinx) + ax/hinx;
+      values[5] = -cy/(hiny*hiny) + ay/hiny;
+      values[6] = -cz/(hinz*hinz) + az/hinz;
+
+      if (nx > 1)
+      {
+         values[0] += 2.0*cx/(hinx*hinx) - 1.*ax/hinx;
+      }
+      if (ny > 1)
+      {
+         values[0] += 2.0*cy/(hiny*hiny) - 1.*ay/hiny;
+      }
+      if (nz > 1)
+      {
+         values[0] += 2.0*cz/(hinz*hinz) - 1.*az/hinz;
+      }
+   } 
+   else if (1 == atype) /* backward scheme for conv */
+   {
+      values[1] = -cx/(hinx*hinx) - ax/hinx;
+      values[2] = -cy/(hiny*hiny) - ay/hiny;
+      values[3] = -cz/(hinz*hinz) - az/hinz;
+      values[4] = -cx/(hinx*hinx);
+      values[5] = -cy/(hiny*hiny);
+      values[6] = -cz/(hinz*hinz);
+
+      if (nx > 1)
+      {
+         values[0] += 2.0*cx/(hinx*hinx) + 1.*ax/hinx;
+      }
+      if (ny > 1)
+      {
+         values[0] += 2.0*cy/(hiny*hiny) + 1.*ay/hiny;
+      }
+      if (nz > 1)
+      {
+         values[0] += 2.0*cz/(hinz*hinz) + 1.*az/hinz;
+      }
    }
-   if (ny > 1)
+   else if (3 == atype) /* upwind scheme */
    {
-      values[0] += 2.0*cy/(hiny*hiny) - 1.*ay/hiny;
+      sign_prod = sign_double(cx) * sign_double(ax);
+      if (sign_prod == 1) /* same sign: use backward scheme */
+      {
+         values[1] = -cx/(hinx*hinx) - ax/hinx;
+         values[4] = -cx/(hinx*hinx);
+         if (nx > 1)
+         {
+            values[0] += 2.0*cx/(hinx*hinx) + 1.*ax/hinx;
+         }
+      }
+      else /* different signs: use forward scheme */
+      {
+         values[1] = -cx/(hinx*hinx);
+         values[4] = -cx/(hinx*hinx) + ax/hinx;
+         if (nx > 1)
+         {
+            values[0] += 2.0*cx/(hinx*hinx) - 1.*ax/hinx;
+         }
+      }
+
+      sign_prod = sign_double(cy) * sign_double(ay);
+      if (sign_prod == 1) /* same sign: use backward scheme */
+      {
+         values[2] = -cy/(hiny*hiny) - ay/hiny;
+         values[5] = -cy/(hiny*hiny);
+         if (ny > 1)
+         {
+            values[0] += 2.0*cy/(hiny*hiny) + 1.*ay/hiny;
+         }
+      }
+      else /* different signs: use forward scheme */
+      {
+         values[2] = -cy/(hiny*hiny);
+         values[5] = -cy/(hiny*hiny) + ay/hiny;
+         if (ny > 1)
+         {
+            values[0] += 2.0*cy/(hiny*hiny) - 1.*ay/hiny;
+         }
+      }
+
+      sign_prod = sign_double(cz) * sign_double(az);
+      if (sign_prod == 1) /* same sign: use backward scheme */
+      {
+         values[3] = -cz/(hinz*hinz) - az/hinz;
+         values[6] = -cz/(hinz*hinz);
+         if (nz > 1)
+         {
+            values[0] += 2.0*cz/(hinz*hinz) + 1.*az/hinz;
+         }
+      }
+      else /* different signs: use forward scheme */
+      {
+         values[3] = -cz/(hinz*hinz);
+         values[6] = -cz/(hinz*hinz) + az/hinz;
+         if (nz > 1)
+         {
+            values[0] += 2.0*cz/(hinz*hinz) - 1.*az/hinz;
+         }
+      }
    }
-   if (nz > 1)
+   else /* centered difference scheme */
    {
-      values[0] += 2.0*cz/(hinz*hinz) - 1.*az/hinz;
+      values[1] = -cx/(hinx*hinx) - ax/(2.*hinx);
+      values[2] = -cy/(hiny*hiny) - ay/(2.*hiny);
+      values[3] = -cz/(hinz*hinz) - az/(2.*hinz);
+      values[4] = -cx/(hinx*hinx) + ax/(2.*hinx);
+      values[5] = -cy/(hiny*hiny) + ay/(2.*hiny);
+      values[6] = -cz/(hinz*hinz) + az/(2.*hinz);
+
+      if (nx > 1)
+      {
+         values[0] += 2.0*cx/(hinx*hinx);
+      }
+      if (ny > 1)
+      {
+         values[0] += 2.0*cy/(hiny*hiny);
+      }
+      if (nz > 1)
+      {
+         values[0] += 2.0*cz/(hinz*hinz);
+      }
    }
 
    A = (HYPRE_ParCSRMatrix) GenerateDifConv(hypre_MPI_COMM_WORLD,
@@ -6580,8 +6834,9 @@ BuildParVarDifConv( HYPRE_Int                  argc,
    HYPRE_ParCSRMatrix  A;
    HYPRE_ParVector  rhs;
 
-   HYPRE_Int                 num_procs, myid;
-   HYPRE_Int                 p, q, r;
+   HYPRE_Int           num_procs, myid;
+   HYPRE_Int           p, q, r;
+   HYPRE_Int           type;
    HYPRE_Real          eps;
 
    /*-----------------------------------------------------------
@@ -6601,6 +6856,11 @@ BuildParVarDifConv( HYPRE_Int                  argc,
    P  = 1;
    Q  = num_procs;
    R  = 1;
+   eps = 1.0;
+
+   /* type: 0   : default FD;
+    *       1-3 : FD and examples 1-3 in Ruge-Stuben paper */
+   type = 0;
 
    /*-----------------------------------------------------------
     * Parse command line
@@ -6627,6 +6887,11 @@ BuildParVarDifConv( HYPRE_Int                  argc,
          arg_index++;
          eps  = atof(argv[arg_index++]);
       }
+      else if ( strcmp(argv[arg_index], "-vardifconvRS") == 0 )
+      {
+         arg_index++;
+         type = atoi(argv[arg_index++]);
+      }
       else
       {
          arg_index++;
@@ -6667,8 +6932,17 @@ BuildParVarDifConv( HYPRE_Int                  argc,
     * Generate the matrix
     *-----------------------------------------------------------*/
 
-   A = (HYPRE_ParCSRMatrix) GenerateVarDifConv(hypre_MPI_COMM_WORLD,
-                                               nx, ny, nz, P, Q, R, p, q, r, eps, &rhs);
+   if (0 == type)
+   {
+      A = (HYPRE_ParCSRMatrix) GenerateVarDifConv(hypre_MPI_COMM_WORLD,
+                                                  nx, ny, nz, P, Q, R, p, q, r, eps, &rhs);
+   }
+   else
+   {
+      A = (HYPRE_ParCSRMatrix) GenerateRSVarDifConv(hypre_MPI_COMM_WORLD,
+                                                    nx, ny, nz, P, Q, R, p, q, r, eps, &rhs,
+                                                    type);
+   }
 
    *A_ptr = A;
    *rhs_ptr = rhs;
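
With the new -vardifconvRS <type> flag, type 1-3 dispatches to
GenerateRSVarDifConv (the Ruge-Stuben examples noted in the comment above)
while 0 keeps the original generator; either way the companion right-hand
side comes back through the same out-parameter. A hedged driver-side sketch,
assuming arg_index and the MPI setup are already in place as in the hunks
above:

   HYPRE_ParCSRMatrix A;
   HYPRE_ParVector    rhs;   /* generated alongside A */

   BuildParVarDifConv(argc, argv, arg_index, &A, &rhs);
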
diff --git a/src/test/maxwell_unscaled.c b/src/test/maxwell_unscaled.c
index 9bded3d..66bb0cd 100644
--- a/src/test/maxwell_unscaled.c
+++ b/src/test/maxwell_unscaled.c
@@ -1336,6 +1336,8 @@ main( hypre_int argc,
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs);
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid);
 
+   hypre_GPUInit(-1);
+
    hypre_InitMemoryDebug(myid);
 
    /*-----------------------------------------------------------
@@ -1741,7 +1743,7 @@ main( hypre_int argc,
    for (j = 0; j < data.max_boxsize; j++)
    {
       values[j]= sin((HYPRE_Real)(j+1));
-      values[j]= (HYPRE_Real) rand()/RAND_MAX;
+      values[j]= (HYPRE_Real) hypre_Rand();
       values[j]= (HYPRE_Real) j;
    }
    for (part = 0; part < data.nparts; part++)
@@ -1900,6 +1902,7 @@ main( hypre_int argc,
    hypre_FinalizeMemoryDebug();
 
    /* Finalize MPI */
+   hypre_GPUFinalize();
    hypre_MPI_Finalize();
 
    return (0);
diff --git a/src/test/runtest.sh b/src/test/runtest.sh
index 17c439e..383aa5a 100755
--- a/src/test/runtest.sh
+++ b/src/test/runtest.sh
@@ -82,7 +82,7 @@ function MpirunString
          # RunString="${RunString} -nodes $POE_NUM_NODES $MY_ARGS"
          RunString="poe $MY_ARGS -rmpool pdebug -procs $POE_NUM_PROCS -nodes $POE_NUM_NODES"
          ;;
-      rzzeus*|rzmerl*|ansel*|aztec*|cab*|sierra*|vulcan*)
+      rztopaz*|aztec*|cab*|quartz*|sierra*|syrah*|vulcan*)
          shift
          if [ $NumThreads -gt 0 ] ; then
             export OMP_NUM_THREADS=$NumThreads
diff --git a/src/test/sstruct.c b/src/test/sstruct.c
index b5420f2..2de171c 100644
--- a/src/test/sstruct.c
+++ b/src/test/sstruct.c
@@ -466,7 +466,6 @@ ReadData( char         *filename,
          sdata_line = fgets((sdata + sdata_size), maxline, file);
       }
    }
-
    /* broadcast the data size */
    hypre_MPI_Bcast(&sdata_size, 1, HYPRE_MPI_INT, 0, hypre_MPI_COMM_WORLD);
 
@@ -2419,10 +2418,15 @@ main( hypre_int argc,
 
    /* Initialize MPI */
    hypre_MPI_Init(&argc, &argv);
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::InitArguments args;
+   args.num_threads = 10;
+   Kokkos::initialize (args);
+#endif
 
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs);
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid);
-
+   hypre_GPUInit(-1);
    hypre_InitMemoryDebug(myid);
 
    /*-----------------------------------------------------------
@@ -5734,6 +5738,10 @@ main( hypre_int argc,
    hypre_FinalizeMemoryDebug();
 
    /* Finalize MPI */
+   hypre_GPUFinalize();
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::finalize ();
+#endif
    hypre_MPI_Finalize();
 
    return (0);
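
A pattern worth noting in the sstruct.c hunks above (repeated in struct.c and
struct_migrate.c below): Kokkos is brought up right after MPI and torn down
right before hypre_MPI_Finalize, which is the ordering Kokkos expects.
Condensed sketch; the hard-coded num_threads value is taken verbatim from the
hunks:

   hypre_MPI_Init(&argc, &argv);
#if defined(HYPRE_USE_KOKKOS)
   Kokkos::InitArguments args;
   args.num_threads = 10;     /* fixed thread count, as in the hunks */
   Kokkos::initialize (args);
#endif

   /* ... set up, solve, tear down ... */

#if defined(HYPRE_USE_KOKKOS)
   Kokkos::finalize ();
#endif
   hypre_MPI_Finalize();
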
diff --git a/src/test/struct.c b/src/test/struct.c
index 9187bc0..e3d59f1 100644
--- a/src/test/struct.c
+++ b/src/test/struct.c
@@ -188,7 +188,12 @@ main( hypre_int argc,
 
    /* Initialize MPI */
    hypre_MPI_Init(&argc, &argv);
-
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::InitArguments args;
+   args.num_threads = 10;
+   Kokkos::initialize (args);
+#endif
+   
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs );
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
 
@@ -839,7 +844,7 @@ main( hypre_int argc,
        * Set up the stencil structure needed for matrix creation
        * which is always the case for read_fromfile_param == 0
        *-----------------------------------------------------------*/
- 
+      
       HYPRE_StructStencilCreate(dim, (2-sym)*dim + 1, &stencil);
       for (s = 0; s < (2-sym)*dim + 1; s++)
       {
@@ -884,7 +889,6 @@ main( hypre_int argc,
            (read_rhsfromfile_param ==1) 
          )
       {
-         hypre_printf("\nreading linear system from files: matrix, rhs and x0\n");
          /* ghost selection for reading the matrix and vectors */
          for (i = 0; i < dim; i++)
          {
@@ -970,12 +974,14 @@ main( hypre_int argc,
          HYPRE_StructGridSetPeriodic(grid, periodic);
          HYPRE_StructGridSetNumGhost(grid, num_ghost);
          HYPRE_StructGridAssemble(grid);
-
+	 
+	 
          /*-----------------------------------------------------------
           * Set up the matrix structure
           *-----------------------------------------------------------*/
-
+	 
          HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD, grid, stencil, &A);
+
          if ( solver_id == 3 || solver_id == 4 ||
               solver_id == 13 || solver_id == 14 )
          {
@@ -1008,13 +1014,14 @@ main( hypre_int argc,
                constant_coefficient = 2;
             }
          }
+
          HYPRE_StructMatrixSetSymmetric(A, sym);
          HYPRE_StructMatrixInitialize(A);
 
          /*-----------------------------------------------------------
           * Fill in the matrix elements
           *-----------------------------------------------------------*/
-   
+
          AddValuesMatrix(A,grid,cx,cy,cz,conx,cony,conz);
 
          /* Zero out stencils reaching to real boundary */
@@ -1022,7 +1029,6 @@ main( hypre_int argc,
 
          if ( constant_coefficient == 0 ) SetStencilBndry(A,grid,periodic); 
          HYPRE_StructMatrixAssemble(A);
-
          /*-----------------------------------------------------------
           * Set up the linear system
           *-----------------------------------------------------------*/
@@ -1041,7 +1047,7 @@ main( hypre_int argc,
 
          HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &x);
          HYPRE_StructVectorInitialize(x);
-    
+
          AddValuesVector(grid,x,periodx0,0.0);
          HYPRE_StructVectorAssemble(x);
 
@@ -2765,6 +2771,9 @@ main( hypre_int argc,
    }
 
    /* Finalize MPI */
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::finalize ();
+#endif
    hypre_MPI_Finalize();
 
    return (0);
@@ -2799,8 +2808,8 @@ AddValuesVector( hypre_StructGrid  *gridvector,
    {
       box      = hypre_BoxArrayBox(gridboxes, ib);
       volume   =  hypre_BoxVolume(box);
-      values   = hypre_CTAlloc(HYPRE_Real, volume);
-
+      values   = hypre_UMCTAlloc(HYPRE_Real, volume);
+      
       /*-----------------------------------------------------------
        * For periodic b.c. in all directions, need rhs to satisfy 
        * compatibility condition. Achieved by setting a source and
@@ -2827,8 +2836,10 @@ AddValuesVector( hypre_StructGrid  *gridvector,
 
       ilower = hypre_BoxIMin(box);
       iupper = hypre_BoxIMax(box);
+	  
       HYPRE_StructVectorSetBoxValues(zvector, ilower, iupper, values);
-      hypre_TFree(values);
+      hypre_UMTFree(values);
+      
 
    }
 
@@ -2900,8 +2911,8 @@ AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
          {
             box      = hypre_BoxArrayBox(gridboxes, bi);
             volume   =  hypre_BoxVolume(box);
-            values   = hypre_CTAlloc(HYPRE_Real, stencil_size*volume);
-
+	    values     = hypre_UMCTAlloc(HYPRE_Real, stencil_size*volume);
+	    
             for (i = 0; i < stencil_size*volume; i += stencil_size)
             {
                switch (dim)
@@ -2925,14 +2936,16 @@ AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
             }
             ilower = hypre_BoxIMin(box);
             iupper = hypre_BoxIMax(box);
+	    
             HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size,
                                            stencil_indices, values);
-            hypre_TFree(values);
+	    hypre_UMTFree(values);
+	    
          }
       }
       else if ( constant_coefficient==1 )
       {
-         values   = hypre_CTAlloc(HYPRE_Real, stencil_size);
+	 values   = hypre_CTAlloc(HYPRE_Real, stencil_size);
          switch (dim)
          {
             case 1:
@@ -2963,7 +2976,7 @@ AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
          hypre_assert( constant_coefficient==2 );
 
          /* stencil index for the center equals dim, so it's easy to leave out */
-         values   = hypre_CTAlloc(HYPRE_Real, stencil_size-1);
+	 values   = hypre_UMCTAlloc(HYPRE_Real, stencil_size-1);
          switch (dim)
          {
             case 1:
@@ -2984,14 +2997,13 @@ AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
             HYPRE_StructMatrixSetConstantValues(A, stencil_size-1,
                                                 stencil_indices, values);
          }
-         hypre_TFree(values);
+	 hypre_UMTFree(values);
 
          hypre_ForBoxI(bi, gridboxes)
          {
             box      = hypre_BoxArrayBox(gridboxes, bi);
             volume   =  hypre_BoxVolume(box);
-            values   = hypre_CTAlloc(HYPRE_Real, volume);
-
+	    values   = hypre_UMCTAlloc(HYPRE_Real, volume);
             for ( i=0; i < volume; ++i )
             {
                values[i] = center;
@@ -3000,7 +3012,7 @@ AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
             iupper = hypre_BoxIMax(box);
             HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1,
                                            stencil_indices+dim, values);
-            hypre_TFree(values);
+	    hypre_UMTFree(values);
          }
       }
    }
@@ -3043,7 +3055,7 @@ AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
          {
             box      = hypre_BoxArrayBox(gridboxes, bi);
             volume   =  hypre_BoxVolume(box);
-            values   = hypre_CTAlloc(HYPRE_Real, stencil_size*volume);
+            values   = hypre_UMCTAlloc(HYPRE_Real, stencil_size*volume);
 
             for (i = 0; i < stencil_size*volume; i += stencil_size)
             {
@@ -3077,7 +3089,7 @@ AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
             HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size,
                                            stencil_indices, values);
 
-            hypre_TFree(values);
+            hypre_UMTFree(values);
          }
       }
       else if ( constant_coefficient==1 )
@@ -3120,7 +3132,7 @@ AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
       else
       {
          hypre_assert( constant_coefficient==2 );
-         values = hypre_CTAlloc( HYPRE_Real, stencil_size-1 );
+         values = hypre_UMCTAlloc( HYPRE_Real, stencil_size-1 );
          switch (dim)
          {  /* no center in stencil_indices and values */
             case 1:
@@ -3160,7 +3172,7 @@ AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
             HYPRE_StructMatrixSetConstantValues(A, stencil_size,
                                                 stencil_indices, values);
          }
-         hypre_TFree(values);
+         hypre_UMTFree(values);
 
 
          /* center is variable */
@@ -3169,7 +3181,7 @@ AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
          {
             box      = hypre_BoxArrayBox(gridboxes, bi);
             volume   =  hypre_BoxVolume(box);
-            values   = hypre_CTAlloc(HYPRE_Real, volume);
+            values   = hypre_UMCTAlloc(HYPRE_Real, volume);
 
             for ( i=0; i < volume; ++i )
             {
@@ -3179,7 +3191,7 @@ AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
             iupper = hypre_BoxIMax(box);
             HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1,
                                            stencil_indices, values);
-            hypre_TFree(values);
+            hypre_UMTFree(values);
          }
       }
    }
@@ -3260,7 +3272,7 @@ SetStencilBndry(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,HYPRE_Int* peri
       {
          for (ib = 0; ib < size; ib++)
          {
-            values = hypre_CTAlloc(HYPRE_Real, vol[ib]);
+	    values = hypre_UMCTAlloc(HYPRE_Real, vol[ib]);
         
             for (i = 0; i < vol[ib]; i++)
             {
@@ -3286,7 +3298,7 @@ SetStencilBndry(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,HYPRE_Int* peri
                                               1, stencil_indices, values);
                ilower[ib][d] = j;
             }
-            hypre_TFree(values);
+	    hypre_UMTFree(values);
          }
       }
    }
diff --git a/src/test/struct_migrate.c b/src/test/struct_migrate.c
index d688832..6dcfccf 100644
--- a/src/test/struct_migrate.c
+++ b/src/test/struct_migrate.c
@@ -65,7 +65,11 @@ main( hypre_int argc,
 
    /* Initialize MPI */
    hypre_MPI_Init(&argc, &argv);
-
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::InitArguments args;
+   args.num_threads = 10;
+   Kokkos::initialize (args);
+#endif
    hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs );
    hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
 
@@ -362,7 +366,7 @@ main( hypre_int argc,
 
    if (myid == 0)
    {
-      printf("\nCheck = %1.0f (success = 0)\n\n", check);
+      hypre_printf("\nCheck = %1.0f (success = 0)\n\n", check);
    }
 
    /*-----------------------------------------------------------
@@ -397,6 +401,9 @@ main( hypre_int argc,
    HYPRE_StructVectorDestroy(check_vector);
 
    /* Finalize MPI */
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::finalize ();
+#endif
    hypre_MPI_Finalize();
 
    return (0);
@@ -413,7 +420,7 @@ AddValuesVector( hypre_StructGrid   *grid,
 {
    HYPRE_Int          ierr = 0;
    hypre_BoxArray    *gridboxes;
-   HYPRE_Int          i,ib;
+   HYPRE_Int          ib;
    hypre_IndexRef     ilower;
    hypre_IndexRef     iupper;
    hypre_Box         *box;
@@ -426,18 +433,19 @@ AddValuesVector( hypre_StructGrid   *grid,
    hypre_ForBoxI(ib, gridboxes)
    {
       box      = hypre_BoxArrayBox(gridboxes, ib);
-      volume   =  hypre_BoxVolume(box);
-      values   = hypre_CTAlloc(HYPRE_Real, volume);
+      volume   = hypre_BoxVolume(box);
+      values   = hypre_DeviceCTAlloc(HYPRE_Real, volume);
 
-      for (i = 0; i < volume; i++)
+      hypre_LoopBegin(volume,i)
       {
          values[i] = value;
       }
-
+      hypre_LoopEnd();
+	
       ilower = hypre_BoxIMin(box);
       iupper = hypre_BoxIMax(box);
       HYPRE_StructVectorSetBoxValues(vector, ilower, iupper, values);
-      hypre_TFree(values);
+      hypre_DeviceTFree(values);
    }
 
    return ierr;
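
The AddValuesVector hunk above is the template for the memory-space changes
running through these drivers: host allocations become hypre_DeviceCTAlloc
(or hypre_UMCTAlloc for unified memory in struct.c), the plain for loop
becomes a hypre_LoopBegin/hypre_LoopEnd pair that declares its own loop
index, and the matching free changes space as well. The resulting idiom,
extracted for clarity (illustrative only):

   values = hypre_DeviceCTAlloc(HYPRE_Real, volume);   /* device-side buffer */
   hypre_LoopBegin(volume, i)                          /* i declared by the macro */
   {
      values[i] = value;
   }
   hypre_LoopEnd();
   HYPRE_StructVectorSetBoxValues(vector, ilower, iupper, values);
   hypre_DeviceTFree(values);
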
diff --git a/src/test/struct_newboxloop.c b/src/test/struct_newboxloop.c
new file mode 100644
index 0000000..26a2fdc
--- /dev/null
+++ b/src/test/struct_newboxloop.c
@@ -0,0 +1,1956 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include "_hypre_utilities.h"
+#include "HYPRE_struct_ls.h"
+#include "HYPRE_krylov.h"
+
+#if defined( KOKKOS_HAVE_MPI )
+#include <mpi.h>
+#endif
+
+#define HYPRE_MFLOPS 0
+#if HYPRE_MFLOPS
+#include "_hypre_struct_mv.h"
+#endif
+
+/* RDF: Why is this include here? */
+#include "_hypre_struct_mv.h"
+
+#ifdef HYPRE_DEBUG
+#include <cegdb.h>
+#endif
+
+/* begin lobpcg */
+
+#define NO_SOLVER -9198
+
+#include <time.h>
+ 
+#include "fortran_matrix.h"
+#include "HYPRE_lobpcg.h"
+#include "interpreter.h"
+#include "multivector.h"
+#include "HYPRE_MatvecFunctions.h"
+
+/* end lobpcg */
+
+HYPRE_Int  SetStencilBndry(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,HYPRE_Int* period);
+
+HYPRE_Int  AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
+                           HYPRE_Real        cx,
+                           HYPRE_Real        cy,
+                           HYPRE_Real        cz,
+                           HYPRE_Real        conx,
+                           HYPRE_Real        cony,
+                           HYPRE_Real        conz) ;
+
+HYPRE_Int AddValuesVector( hypre_StructGrid  *gridvector,
+                           hypre_StructVector *zvector,
+                           HYPRE_Int          *period, 
+                           HYPRE_Real         value  )  ;
+
+/*--------------------------------------------------------------------------
+ * Test driver for structured matrix interface (structured storage)
+ *--------------------------------------------------------------------------*/
+ 
+/*----------------------------------------------------------------------
+ * Standard 7-point laplacian in 3D with grid and anisotropy determined
+ * as command line arguments.  Do `driver -help' for usage info.
+ *----------------------------------------------------------------------*/
+
+hypre_int
+main( hypre_int argc,
+      char *argv[] )
+{
+   HYPRE_Int           arg_index;
+   HYPRE_Int           print_usage;
+   HYPRE_Int           nx, ny, nz;
+   HYPRE_Int           P, Q, R;
+   HYPRE_Int           bx, by, bz;
+   HYPRE_Int           px, py, pz;
+   HYPRE_Real          cx, cy, cz;
+   HYPRE_Real          conx, cony, conz;
+   HYPRE_Int           solver_id;
+   HYPRE_Int           solver_type;
+
+   /*HYPRE_Real          dxyz[3];*/
+
+   HYPRE_Int           A_num_ghost[6] = {0, 0, 0, 0, 0, 0};
+   HYPRE_Int           v_num_ghost[3] = {0,0,0};
+                     
+   HYPRE_StructMatrix  A;
+   HYPRE_StructVector  b;
+   HYPRE_StructVector  x;
+
+   HYPRE_StructSolver  solver;
+   HYPRE_StructSolver  precond;
+   HYPRE_Int           num_iterations;
+   HYPRE_Int           time_index;
+   HYPRE_Real          final_res_norm;
+   HYPRE_Real          cf_tol;
+
+   HYPRE_Int           num_procs, myid;
+
+   HYPRE_Int           p, q, r;
+   HYPRE_Int           dim;
+   HYPRE_Int           n_pre, n_post;
+   HYPRE_Int           nblocks;
+   HYPRE_Int           skip;
+   HYPRE_Int           sym;
+   HYPRE_Int           rap;
+   HYPRE_Int           relax;
+   HYPRE_Real          jacobi_weight;
+   HYPRE_Int           usr_jacobi_weight;
+   HYPRE_Int           jump;
+   HYPRE_Int           rep, reps;
+
+   HYPRE_Int         **iupper;
+   HYPRE_Int         **ilower;
+
+   HYPRE_Int           istart[3];
+   HYPRE_Int           periodic[3];
+   HYPRE_Int         **offsets;
+   HYPRE_Int           constant_coefficient = 0;
+   HYPRE_Int          *stencil_entries;
+   HYPRE_Int           stencil_size;
+   HYPRE_Int           diag_rank;
+   hypre_Index         diag_index;
+
+   HYPRE_StructGrid    grid;
+   HYPRE_StructGrid    readgrid;
+   HYPRE_StructStencil stencil;
+
+   HYPRE_Int           i, s;
+   HYPRE_Int           ix, iy, iz, ib;
+
+   HYPRE_Int           read_fromfile_param;
+   HYPRE_Int           read_fromfile_index;
+   HYPRE_Int           read_rhsfromfile_param;
+   HYPRE_Int           read_rhsfromfile_index;
+   HYPRE_Int           read_x0fromfile_param;
+   HYPRE_Int           read_x0fromfile_index;
+   HYPRE_Int           periodx0[3] = {0,0,0};
+   HYPRE_Int          *readperiodic;
+   HYPRE_Int           sum;
+   HYPRE_Int           inner;
+
+   HYPRE_Int           print_system = 0;
+
+   /* begin lobpcg */
+   
+   HYPRE_Int lobpcgFlag = 0;
+   HYPRE_Int lobpcgSeed = 0;
+   HYPRE_Int blockSize = 1;
+   HYPRE_Int verbosity = 1;
+   HYPRE_Int iterations;
+   HYPRE_Int maxIterations = 100;
+   HYPRE_Int checkOrtho = 0;
+   HYPRE_Int printLevel = 0;
+   HYPRE_Int pcgIterations = 0;
+   HYPRE_Int pcgMode = 0;
+   HYPRE_Real tol = 1e-6;
+   HYPRE_Real pcgTol = 1e-2;
+   HYPRE_Real nonOrthF;
+
+   FILE* filePtr;
+
+   mv_MultiVectorPtr eigenvectors = NULL;
+   mv_MultiVectorPtr constrains = NULL;
+   HYPRE_Real* eigenvalues = NULL;
+
+   HYPRE_Real* residuals;
+   utilities_FortranMatrix* residualNorms;
+   utilities_FortranMatrix* residualNormsHistory;
+   utilities_FortranMatrix* eigenvaluesHistory;
+   utilities_FortranMatrix* printBuffer;
+   utilities_FortranMatrix* gramXX;
+   utilities_FortranMatrix* identity;
+
+   HYPRE_StructSolver        lobpcg_solver;
+
+   mv_InterfaceInterpreter* interpreter;
+   HYPRE_MatvecFunctions matvec_fn;
+   /* end lobpcg */
+
+   /*-----------------------------------------------------------
+    * Initialize some stuff
+    *-----------------------------------------------------------*/
+
+   /* Initialize MPI */
+   hypre_MPI_Init(&argc, &argv);
+   
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::InitArguments args;
+   args.num_threads = 12;
+   Kokkos::initialize (args);
+#endif
+   
+   hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs );
+   hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
+
+
+#ifdef HYPRE_DEBUG
+   cegdb(&argc, &argv, myid);
+#endif
+
+   hypre_InitMemoryDebug(myid);
+
+   /*-----------------------------------------------------------
+    * Set defaults
+    *-----------------------------------------------------------*/
+ 
+   dim = 3;
+
+   skip  = 0;
+   sym  = 1;
+   rap = 0;
+   relax = 1;
+   usr_jacobi_weight= 0;
+   jump  = 0;
+   reps = 1;
+
+   nx = 10;
+   ny = 10;
+   nz = 10;
+
+   P  = num_procs;
+   Q  = 1;
+   R  = 1;
+
+   bx = 1;
+   by = 1;
+   bz = 1;
+
+   cx = 1.0;
+   cy = 1.0;
+   cz = 1.0;
+   conx = 0.0;
+   cony = 0.0;
+   conz = 0.0;
+
+   n_pre  = 1;
+   n_post = 1;
+
+   solver_id = 0;
+   solver_type = 1;
+
+   istart[0] = -3;
+   istart[1] = -3;
+   istart[2] = -3;
+
+   px = 0;
+   py = 0;
+   pz = 0;
+
+   cf_tol = 0.90;
+
+   /* setting defaults for the reading parameters    */
+   read_fromfile_param = 0;
+   read_fromfile_index = argc;
+   read_rhsfromfile_param = 0;
+   read_rhsfromfile_index = argc;
+   read_x0fromfile_param = 0;
+   read_x0fromfile_index = argc;
+   sum = 0;
+
+   /* ghosts for the building of matrix: default  */
+   for (i = 0; i < dim; i++)
+   {
+      A_num_ghost[2*i] = 1;
+      A_num_ghost[2*i + 1] = 1;
+   }
+
+   /*-----------------------------------------------------------
+    * Parse command line
+    *-----------------------------------------------------------*/
+ 
+   print_usage = 0;
+   arg_index = 1;
+   while (arg_index < argc)
+   {
+      if ( strcmp(argv[arg_index], "-n") == 0 )
+      {
+         arg_index++;
+         nx = atoi(argv[arg_index++]);
+         ny = atoi(argv[arg_index++]);
+         nz = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-istart") == 0 )
+      {
+         arg_index++;
+         istart[0] = atoi(argv[arg_index++]);
+         istart[1] = atoi(argv[arg_index++]);
+         istart[2] = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-P") == 0 )
+      {
+         arg_index++;
+         P  = atoi(argv[arg_index++]);
+         Q  = atoi(argv[arg_index++]);
+         R  = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-b") == 0 )
+      {
+         arg_index++;
+         bx = atoi(argv[arg_index++]);
+         by = atoi(argv[arg_index++]);
+         bz = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-p") == 0 )
+      {
+         arg_index++;
+         px = atoi(argv[arg_index++]);
+         py = atoi(argv[arg_index++]);
+         pz = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-c") == 0 )
+      {
+         arg_index++;
+         cx = atof(argv[arg_index++]);
+         cy = atof(argv[arg_index++]);
+         cz = atof(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-convect") == 0 )
+      {
+         arg_index++;
+         conx = atof(argv[arg_index++]);
+         cony = atof(argv[arg_index++]);
+         conz = atof(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-d") == 0 )
+      {
+         arg_index++;
+         dim = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-fromfile") == 0 )
+      {
+         arg_index++;
+         read_fromfile_param = 1;
+         read_fromfile_index = arg_index;
+      }
+      else if ( strcmp(argv[arg_index], "-rhsfromfile") == 0 )
+      {
+         arg_index++;
+         read_rhsfromfile_param = 1;
+         read_rhsfromfile_index = arg_index;
+      }
+      else if ( strcmp(argv[arg_index], "-x0fromfile") == 0 )
+      {
+         arg_index++;
+         read_x0fromfile_param = 1;
+         read_x0fromfile_index = arg_index;
+      }
+      else if (strcmp(argv[arg_index], "-repeats") == 0 )
+      {
+         arg_index++;
+         reps = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-solver") == 0 )
+      {
+         arg_index++;
+
+	 /* begin lobpcg */
+	 if ( strcmp(argv[arg_index], "none") == 0 ) {
+            solver_id = NO_SOLVER;
+            arg_index++;
+	 }
+	 else /* end lobpcg */
+            solver_id = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-v") == 0 )
+      {
+         arg_index++;
+         n_pre = atoi(argv[arg_index++]);
+         n_post = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-rap") == 0 )
+      {
+         arg_index++;
+         rap = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-relax") == 0 )
+      {
+         arg_index++;
+         relax = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-w") == 0 )
+      {
+         arg_index++;
+         jacobi_weight= atof(argv[arg_index++]);
+         usr_jacobi_weight= 1; /* flag user weight */
+      }
+      else if ( strcmp(argv[arg_index], "-sym") == 0 )
+      {
+         arg_index++;
+         sym = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-skip") == 0 )
+      {
+         arg_index++;
+         skip = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-jump") == 0 )
+      {
+         arg_index++;
+         jump = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-solver_type") == 0 )
+      {
+         arg_index++;
+         solver_type = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-cf") == 0 )
+      {
+         arg_index++;
+         cf_tol = atof(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-print") == 0 )
+      {
+         arg_index++;
+         print_system = 1;
+      }
+      else if ( strcmp(argv[arg_index], "-help") == 0 )
+      {
+         print_usage = 1;
+         break;
+      }
+      /* begin lobpcg */
+      else if ( strcmp(argv[arg_index], "-lobpcg") == 0 ) 
+      {				         /* use lobpcg */
+         arg_index++;
+	 lobpcgFlag = 1;
+      }
+      else if ( strcmp(argv[arg_index], "-orthchk") == 0 )
+      {			/* lobpcg: check orthonormality */
+         arg_index++;
+	 checkOrtho = 1;
+      }
+      else if ( strcmp(argv[arg_index], "-verb") == 0 ) 
+      {			  /* lobpcg: verbosity level */
+         arg_index++;
+         verbosity = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-vrand") == 0 ) 
+      {                         /* lobpcg: block size */
+         arg_index++;
+         blockSize = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-seed") == 0 )
+      {		           /* lobpcg: seed for srand */
+         arg_index++;
+         lobpcgSeed = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-itr") == 0 ) 
+      {		     /* lobpcg: max # of iterations */
+         arg_index++;
+         maxIterations = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-tol") == 0 ) 
+      {		               /* lobpcg: tolerance */
+         arg_index++;
+         tol = atof(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-pcgitr") == 0 ) 
+      {		   /* lobpcg: max inner pcg iterations */
+         arg_index++;
+         pcgIterations = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-pcgtol") == 0 ) 
+      {	     /* lobpcg: inner pcg iterations tolerance */
+         arg_index++;
+         pcgTol = atof(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-pcgmode") == 0 ) 
+      {		 /* lobpcg: initial guess for inner pcg */
+         arg_index++;	      /* 0: zero, otherwise rhs */
+         pcgMode = atoi(argv[arg_index++]);
+      }
+      else if ( strcmp(argv[arg_index], "-vout") == 0 )
+      {			      /* lobpcg: print level */
+         arg_index++;
+         printLevel = atoi(argv[arg_index++]);
+      }
+      /* end lobpcg */
+      else
+      {
+         arg_index++;
+      }
+   }
+
+   /* begin lobpcg */
+
+   if ( solver_id == 0 && lobpcgFlag )
+      solver_id = 10;
+
+   /*end lobpcg */
+
+   sum = read_x0fromfile_param + read_rhsfromfile_param + read_fromfile_param; 
+
+   /*-----------------------------------------------------------
+    * Print usage info
+    *-----------------------------------------------------------*/
+ 
+   if ( (print_usage) && (myid == 0) )
+   {
+      hypre_printf("\n");
+      hypre_printf("Usage: %s [<options>]\n", argv[0]);
+      hypre_printf("\n");
+      hypre_printf("  -n <nx> <ny> <nz>   : problem size per block\n");
+      hypre_printf("  -istart <istart[0]> <istart[1]> <istart[2]> : start of box\n");
+      hypre_printf("  -P <Px> <Py> <Pz>   : processor topology\n");
+      hypre_printf("  -b <bx> <by> <bz>   : blocking per processor\n");
+      hypre_printf("  -p <px> <py> <pz>   : periodicity in each dimension\n");
+      hypre_printf("  -c <cx> <cy> <cz>   : diffusion coefficients\n");
+      hypre_printf("  -convect <x> <y> <z>: convection coefficients\n");
+      hypre_printf("  -d <dim>            : problem dimension (2 or 3)\n");
+      hypre_printf("  -fromfile <name>    : prefix name for matrixfiles\n");
+      hypre_printf("  -rhsfromfile <name> : prefix name for rhsfiles\n");
+      hypre_printf("  -x0fromfile <name>  : prefix name for firstguessfiles\n");
+      hypre_printf("  -repeats <reps>     : number of times to repeat the run, default 1.\n");
+      hypre_printf("  -solver <ID>        : solver ID\n");
+      hypre_printf("                        0  - axpy\n");
+      hypre_printf("                        1  - spMV\n");
+      hypre_printf("                        2  - inner product\n");
+      hypre_printf("                        3  - spMV with constant coeffs\n");
+      hypre_printf("                        4  - spMV with constant coeffs var diag\n");
+      hypre_printf("                        8  - Jacobi\n");
+      hypre_printf("  -sym <s>            : symmetric storage (1) or not (0)\n");
+      hypre_printf("  -jump <num>         : num levels to jump in SparseMSG\n");
+      hypre_printf("\n");
+   }
+
+   if ( print_usage )
+   {
+      exit(1);
+   }
+
+   /*-----------------------------------------------------------
+    * Check a few things
+    *-----------------------------------------------------------*/
+
+   if ((P*Q*R) > num_procs)
+   {
+      if (myid == 0)
+      {
+         hypre_printf("Error: PxQxR is more than the number of processors\n");
+      }
+      exit(1);
+   }
+   else if ((P*Q*R) < num_procs)
+   {
+      if (myid == 0)
+      {
+         hypre_printf("Warning: PxQxR is less than the number of processors\n");
+      }
+   }
+
+   if ((conx != 0.0 || cony !=0 || conz != 0) && sym == 1 )
+   {
+      if (myid == 0)
+      {
+         hypre_printf("Warning: Convection produces non-symmetric matrix\n");
+      }
+      sym = 0;
+   }
+
+   /*-----------------------------------------------------------
+    * Print driver parameters
+    *-----------------------------------------------------------*/
+ 
+   if (myid == 0 && sum == 0)
+   {
+#ifdef HYPRE_USE_DEFAULT
+      hypre_printf("Running with OpenMP macro\n");
+#endif
+#ifdef HYPRE_USE_KOKKOS
+      hypre_printf("Running with Kokkos macro\n");
+#endif
+#ifdef HYPRE_USE_CUDA
+      hypre_printf("Running with CUDA macro\n");
+#endif
+#ifdef HYPRE_USE_RAJA
+      hypre_printf("Running with RAJA macro\n");
+#endif
+#ifdef HYPRE_USE_KOKKOS_CUDA
+      hypre_printf("Running Kokkos with CUDA macro\n");
+#endif
+
+      hypre_printf("Running with these driver parameters:\n");
+      hypre_printf("  (nx, ny, nz)    = (%d, %d, %d)\n", nx, ny, nz);
+      hypre_printf("  (istart[0],istart[1],istart[2]) = (%d, %d, %d)\n", \
+                   istart[0],istart[1],istart[2]);
+      hypre_printf("  (Px, Py, Pz)    = (%d, %d, %d)\n", P,  Q,  R);
+      hypre_printf("  (bx, by, bz)    = (%d, %d, %d)\n", bx, by, bz);
+      hypre_printf("  (px, py, pz)    = (%d, %d, %d)\n", px, py, pz);
+      hypre_printf("  (cx, cy, cz)    = (%f, %f, %f)\n", cx, cy, cz);
+      hypre_printf("  (conx,cony,conz)= (%f, %f, %f)\n", conx, cony, conz);
+      hypre_printf("  (n_pre, n_post) = (%d, %d)\n", n_pre, n_post);
+      hypre_printf("  dim             = %d\n", dim);
+      hypre_printf("  skip            = %d\n", skip);
+      hypre_printf("  sym             = %d\n", sym);
+      hypre_printf("  rap             = %d\n", rap);
+      hypre_printf("  relax           = %d\n", relax);
+      hypre_printf("  jump            = %d\n", jump);
+      hypre_printf("  solver ID       = %d\n", solver_id);
+   }
+
+   if (myid == 0 && sum > 0)
+   {
+      hypre_printf("Running with these driver parameters:\n");
+      hypre_printf("  (cx, cy, cz)    = (%f, %f, %f)\n", cx, cy, cz);
+      hypre_printf("  (conx,cony,conz)= (%f, %f, %f)\n", conx, cony, conz);
+      hypre_printf("  (n_pre, n_post) = (%d, %d)\n", n_pre, n_post);
+      hypre_printf("  dim             = %d\n", dim);
+      hypre_printf("  skip            = %d\n", skip);
+      hypre_printf("  sym             = %d\n", sym);
+      hypre_printf("  rap             = %d\n", rap);
+      hypre_printf("  relax           = %d\n", relax);
+      hypre_printf("  jump            = %d\n", jump);
+      hypre_printf("  solver ID       = %d\n", solver_id);
+      hypre_printf("  the grid is read from  file \n");
+	     
+   }
+  
+   /*-----------------------------------------------------------
+    * Set up the stencil structure (7-point in 3D) when the
+    * matrix is NOT read from file, and the grid structure used
+    * when NO files are read
+    *-----------------------------------------------------------*/
+   
+   switch (dim)
+   {
+   case 1:
+	   nblocks = bx;
+	   if(sym)
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 2);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 1);
+		   offsets[0][0] = -1; 
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 1);
+		   offsets[1][0] = 0; 
+	   }
+	   else
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 3);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 1);
+		   offsets[0][0] = -1;
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 1);
+		   offsets[1][0] = 0;
+		   offsets[2] = hypre_CTAlloc(HYPRE_Int, 1);
+		   offsets[2][0] = 1;
+	   }
+	   /* compute p from P and myid */
+	   p = myid % P;
+	   break;
+	   
+   case 2:
+	   nblocks = bx*by;
+	   if(sym)
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 3);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[0][0] = -1; 
+		   offsets[0][1] = 0; 
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[1][0] = 0; 
+		   offsets[1][1] = -1; 
+		   offsets[2] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[2][0] = 0; 
+		   offsets[2][1] = 0; 
+	   }
+	   else
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 5);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[0][0] = -1; 
+		   offsets[0][1] = 0; 
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[1][0] = 0; 
+		   offsets[1][1] = -1; 
+		   offsets[2] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[2][0] = 0; 
+		   offsets[2][1] = 0; 
+		   offsets[3] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[3][0] = 1; 
+		   offsets[3][1] = 0; 
+		   offsets[4] = hypre_CTAlloc(HYPRE_Int, 2);
+		   offsets[4][0] = 0; 
+		   offsets[4][1] = 1; 
+	   }
+	   /* compute p,q from P,Q and myid */
+	   p = myid % P;
+	   q = (( myid - p)/P) % Q;
+	   break;
+	   
+   case 3:
+	   nblocks = bx*by*bz;
+	   if(sym)
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 4);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[0][0] = -1; 
+		   offsets[0][1] = 0; 
+		   offsets[0][2] = 0; 
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[1][0] = 0; 
+		   offsets[1][1] = -1; 
+		   offsets[1][2] = 0; 
+		   offsets[2] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[2][0] = 0; 
+		   offsets[2][1] = 0; 
+		   offsets[2][2] = -1; 
+		   offsets[3] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[3][0] = 0; 
+		   offsets[3][1] = 0; 
+		   offsets[3][2] = 0; 
+	   }
+	   else
+	   {
+		   offsets = hypre_CTAlloc(HYPRE_Int*, 7);
+		   offsets[0] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[0][0] = -1; 
+		   offsets[0][1] = 0; 
+		   offsets[0][2] = 0; 
+		   offsets[1] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[1][0] = 0; 
+		   offsets[1][1] = -1; 
+		   offsets[1][2] = 0; 
+		   offsets[2] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[2][0] = 0; 
+		   offsets[2][1] = 0; 
+		   offsets[2][2] = -1; 
+		   offsets[3] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[3][0] = 0; 
+		   offsets[3][1] = 0; 
+		   offsets[3][2] = 0; 
+		   offsets[4] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[4][0] = 1; 
+		   offsets[4][1] = 0; 
+		   offsets[4][2] = 0; 
+		   offsets[5] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[5][0] = 0; 
+		   offsets[5][1] = 1; 
+		   offsets[5][2] = 0; 
+		   offsets[6] = hypre_CTAlloc(HYPRE_Int, 3);
+		   offsets[6][0] = 0; 
+		   offsets[6][1] = 0; 
+		   offsets[6][2] = 1; 
+	   }
+	   /* compute p,q,r from P,Q,R and myid */
+	   p = myid % P;
+	   q = (( myid - p)/P) % Q;
+	   r = ( myid - p - P*q)/( P*Q );
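+       /* worked example (illustrative): with (P,Q,R) = (2,2,2) and myid = 5,
+          p = 5 % 2 = 1, q = ((5-1)/2) % 2 = 0, and
+          r = (5 - 1 - 2*0)/(2*2) = 1, so process 5 owns position (1,0,1)
+          of the processor topology */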
+	   break;
+   }
+   
+   if (myid >= (P*Q*R))
+   {
+	   /* My processor has no data on it */
+	   nblocks = bx = by = bz = 0;
+   }
+   
+   /*-----------------------------------------------------------
+    * Set up the stencil structure needed for matrix creation,
+    * which is always done when read_fromfile_param == 0
+    *-----------------------------------------------------------*/
+   
+   HYPRE_StructStencilCreate(dim, (2-sym)*dim + 1, &stencil);
+   for (s = 0; s < (2-sym)*dim + 1; s++)
+   {
+	   HYPRE_StructStencilSetElement(stencil, s, offsets[s]);
+   }
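+
+   /* note: the stencil created above has (2-sym)*dim + 1 entries; e.g. for
+      dim = 3 this is 7 entries in the non-symmetric case (sym = 0) and only
+      4 entries (the three "negative" offsets plus the center) when symmetric
+      storage is used (sym = 1) */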
+   
+   /*-----------------------------------------------------------
+	* Set up periodic
+	*-----------------------------------------------------------*/
+   
+   periodic[0] = px;
+   periodic[1] = py;
+   periodic[2] = pz;
+   
+   /*-----------------------------------------------------------
+	* Set up dxyz for PFMG solver
+	*-----------------------------------------------------------*/
+
+   /* We handle the extreme cases first: reading everything from files
+    * (matrix, rhs and x0 => sum = 3) and building everything from scratch
+    * (grid, stencils, extents => sum = 0) */
+
+   if ( (read_fromfile_param ==1) &&
+		(read_x0fromfile_param ==1) &&
+		(read_rhsfromfile_param ==1) 
+	   )
+   {
+	   hypre_printf("\nreading linear system from files: matrix, rhs and x0\n");
+	   /* ghost selection for reading the matrix and vectors */
+	   for (i = 0; i < dim; i++)
+	   {
+		   A_num_ghost[2*i] = 1;
+		   A_num_ghost[2*i + 1] = 1;
+		   v_num_ghost[2*i] = 1;
+		   v_num_ghost[2*i + 1] = 1;
+	   }
+	   
+	   A = (HYPRE_StructMatrix)
+		   hypre_StructMatrixRead(hypre_MPI_COMM_WORLD,
+								  argv[read_fromfile_index],A_num_ghost);
+	   
+	   b = (HYPRE_StructVector)
+		   hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+								  argv[read_rhsfromfile_index],v_num_ghost);
+	   
+	   x = (HYPRE_StructVector)
+		   hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+								  argv[read_x0fromfile_index],v_num_ghost);
+   }
+   
+   /* beginning of sum == 0  */
+   if (sum == 0)    /* no read from any file */
+   {
+	   /*-----------------------------------------------------------
+		* prepare space for the extents
+		*-----------------------------------------------------------*/
+	   
+	   ilower = hypre_CTAlloc(HYPRE_Int*, nblocks);
+	   iupper = hypre_CTAlloc(HYPRE_Int*, nblocks);
+	   for (i = 0; i < nblocks; i++)
+	   {
+		   ilower[i] = hypre_CTAlloc(HYPRE_Int, dim);
+		   iupper[i] = hypre_CTAlloc(HYPRE_Int, dim);
+	   }
+	   
+	   /* compute ilower and iupper from (p,q,r), (bx,by,bz), and (nx,ny,nz) */
+	   ib = 0;
+	   switch (dim)
+	   {
+	   case 1:
+		   for (ix = 0; ix < bx; ix++)
+		   {
+			   ilower[ib][0] = istart[0]+ nx*(bx*p+ix);
+			   iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1;
+			   ib++;
+		   }
+		   break;
+	   case 2:
+		   for (iy = 0; iy < by; iy++)
+			   for (ix = 0; ix < bx; ix++)
+			   {
+				   ilower[ib][0] = istart[0]+ nx*(bx*p+ix);
+				   iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1;
+				   ilower[ib][1] = istart[1]+ ny*(by*q+iy);
+				   iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1;
+				   ib++;
+			   }
+		   break;
+	   case 3:
+		   for (iz = 0; iz < bz; iz++)
+			   for (iy = 0; iy < by; iy++)
+				   for (ix = 0; ix < bx; ix++)
+				   {
+					   ilower[ib][0] = istart[0]+ nx*(bx*p+ix);
+					   iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1;
+					   ilower[ib][1] = istart[1]+ ny*(by*q+iy);
+					   iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1;
+					   ilower[ib][2] = istart[2]+ nz*(bz*r+iz);
+					   iupper[ib][2] = istart[2]+ nz*(bz*r+iz+1) - 1;
+					   ib++;
+				   }
+		   break;
+	   }
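+
+       /* illustrative example: with istart[0] = 0, nx = 10, bx = 2 and p = 0,
+          block ix = 0 spans indices [0,9] and block ix = 1 spans [10,19];
+          process p = 1 would continue with [20,29] and [30,39] */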
+	   
+	   HYPRE_StructGridCreate(hypre_MPI_COMM_WORLD, dim, &grid);
+	   for (ib = 0; ib < nblocks; ib++)
+	   {
+		   /* Add to the grid a new box defined by ilower[ib], iupper[ib]...*/
+		   HYPRE_StructGridSetExtents(grid, ilower[ib], iupper[ib]);
+	   }
+	   HYPRE_StructGridSetPeriodic(grid, periodic);
+	   HYPRE_StructGridAssemble(grid);
+	   
+	   /*-----------------------------------------------------------
+		* Set up the matrix structure
+		*-----------------------------------------------------------*/
+	   
+	   for (i = 0; i < dim; i++)
+	   {
+		   A_num_ghost[2*i] = 1;
+		   A_num_ghost[2*i + 1] = 1;
+	   }
+	   
+	   HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD, grid, stencil, &A);
+	   if ( solver_id == 3 || solver_id == 4 ||
+			solver_id == 13 || solver_id == 14 )
+	   {
+		   stencil_size  = hypre_StructStencilSize(stencil);
+		   stencil_entries = hypre_CTAlloc(HYPRE_Int, stencil_size);
+		   if ( solver_id == 3 || solver_id == 13)
+		   {
+               for ( i=0; i<stencil_size; ++i ) stencil_entries[i]=i;
+               hypre_StructMatrixSetConstantEntries(
+				   A, stencil_size, stencil_entries );
+               /* ... note: SetConstantEntries is where the constant_coefficient
+                  flag is set in A */
+               hypre_TFree( stencil_entries );
+               constant_coefficient = 1;
+		   }
+		   if ( solver_id == 4 || solver_id == 14)
+		   {
+               hypre_SetIndex3(diag_index, 0, 0, 0);
+               diag_rank = hypre_StructStencilElementRank( stencil, diag_index );
+               hypre_assert( stencil_size>=1 );
+               if ( diag_rank==0 ) stencil_entries[diag_rank]=1;
+               else stencil_entries[diag_rank]=0;
+               for ( i=0; i<stencil_size; ++i )
+               {
+                  if ( i!= diag_rank ) stencil_entries[i]=i;
+               }
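+               /* the slot that would name the diagonal is overwritten above
+                  with a duplicate off-diagonal index, so the list passed to
+                  SetConstantEntries marks every entry except the diagonal as
+                  constant */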
+               hypre_StructMatrixSetConstantEntries(
+                  A, stencil_size, stencil_entries );
+               hypre_TFree( stencil_entries );
+               constant_coefficient = 2;
+		   }
+	   }
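+       /* summary of the constant_coefficient flag set above:
+          0 = all stencil entries variable (the default),
+          1 = all entries constant (solvers 3 and 13),
+          2 = constant off-diagonal entries with a variable diagonal
+              (solvers 4 and 14) */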
+	   HYPRE_StructMatrixSetSymmetric(A, sym);
+	   HYPRE_StructMatrixSetNumGhost(A, A_num_ghost);
+	   HYPRE_StructMatrixInitialize(A);
+	   
+	   /*-----------------------------------------------------------
+		* Fill in the matrix elements
+		*-----------------------------------------------------------*/
+	   
+	   AddValuesMatrix(A,grid,cx,cy,cz,conx,cony,conz);
+	   
+	   /* Zero out stencils reaching to real boundary */
+	   /* But in constant coefficient case, no special stencils! */
+	   
+	   if ( constant_coefficient == 0 ) SetStencilBndry(A,grid,periodic); 
+	   HYPRE_StructMatrixAssemble(A);
+	   
+	   /*-----------------------------------------------------------
+		* Set up the linear system
+		*-----------------------------------------------------------*/
+	   
+	   HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &b);
+	   HYPRE_StructVectorInitialize(b);
+	   
+	   /*-----------------------------------------------------------
+		* For periodic b.c. in all directions, need rhs to satisfy 
+		* compatibility condition. Achieved by setting a source and
+		*  sink of equal strength.  All other problems have rhs = 1.
+		*-----------------------------------------------------------*/
+	   
+	   AddValuesVector(grid,b,periodic,1.0);
+	   HYPRE_StructVectorAssemble(b);
+	   
+	   HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &x);
+	   HYPRE_StructVectorInitialize(x);
+	   
+	   AddValuesVector(grid,x,periodx0,1.0);
+	   HYPRE_StructVectorAssemble(x);
+	   
+	   HYPRE_StructGridDestroy(grid);
+	   
+	   for (i = 0; i < nblocks; i++)
+	   {
+		   hypre_TFree(iupper[i]);
+		   hypre_TFree(ilower[i]);
+	   }
+	   hypre_TFree(ilower);
+	   hypre_TFree(iupper);
+   }
+   
+   /* the grid will be read from file.  */
+   if ( (sum > 0 ) && (sum < 3))
+   {
+	   /* the grid will come from rhs or from x0 */
+	   if (read_fromfile_param == 0)
+	   {
+		   
+		   if ((read_rhsfromfile_param > 0) && (read_x0fromfile_param == 0))
+		   {                     
+               /* read right hand side, extract grid, construct matrix,
+                  construct x0 */
+			   
+               hypre_printf("\ninitial rhs from file prefix :%s\n",
+                            argv[read_rhsfromfile_index]);
+			   
+               /* ghost selection for vector  */
+               for (i = 0; i < dim; i++)
+               {
+				   v_num_ghost[2*i] = 1;
+				   v_num_ghost[2*i + 1] = 1;
+               }
+			   
+               b = (HYPRE_StructVector)
+                  hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+                                         argv[read_rhsfromfile_index],
+                                         v_num_ghost);
+           
+               readgrid = hypre_StructVectorGrid(b) ;
+               readperiodic = hypre_StructGridPeriodic(readgrid);  
+           
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &x);
+               HYPRE_StructVectorInitialize(x);
+           
+               AddValuesVector(readgrid,x,periodx0,0.0);
+               HYPRE_StructVectorAssemble(x);
+           
+               HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD,
+                                        readgrid, stencil, &A);
+               HYPRE_StructMatrixSetSymmetric(A, 1);
+               HYPRE_StructMatrixSetNumGhost(A, A_num_ghost);
+               HYPRE_StructMatrixInitialize(A);
+
+               /*-----------------------------------------------------------
+                * Fill in the matrix elements
+                *-----------------------------------------------------------*/
+   
+               AddValuesMatrix(A,readgrid,cx,cy,cz,conx,cony,conz);
+			   
+               /* Zero out stencils reaching to real boundary */
+           
+               if ( constant_coefficient==0 )
+				   SetStencilBndry(A,readgrid,readperiodic); 
+               HYPRE_StructMatrixAssemble(A);
+		   }   
+          /* done with the case: rhs read from file, x0 constructed */
+		   
+		   /* case when rhs=0 and read x0=1 */
+		   if ((read_rhsfromfile_param == 0) && (read_x0fromfile_param > 0))
+		   {                     
+               /* read right hand side, extract grid, construct matrix,
+                  construct x0 */
+			   
+               hypre_printf("\ninitial x0 from file prefix :%s\n",
+                            argv[read_x0fromfile_index]);
+			   
+               /* ghost selection for vector  */
+               for (i = 0; i < dim; i++)
+               {
+				   v_num_ghost[2*i] = 1;
+				   v_num_ghost[2*i + 1] = 1;
+               }
+			   
+               x = (HYPRE_StructVector)
+				   hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+										  argv[read_x0fromfile_index],v_num_ghost);
+			   
+               readgrid = hypre_StructVectorGrid(x) ;
+               readperiodic = hypre_StructGridPeriodic(readgrid);  
+
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &b);
+               HYPRE_StructVectorInitialize(b);
+               AddValuesVector(readgrid,b,readperiodic,1.0);
+
+               HYPRE_StructVectorAssemble(b);
+
+               HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD,
+                                        readgrid, stencil, &A);
+               HYPRE_StructMatrixSetSymmetric(A, 1);
+               HYPRE_StructMatrixSetNumGhost(A, A_num_ghost);
+               HYPRE_StructMatrixInitialize(A);
+
+               /*-----------------------------------------------------------
+                * Fill in the matrix elements
+                *-----------------------------------------------------------*/
+   
+               AddValuesMatrix(A,readgrid,cx,cy,cz,conx,cony,conz);
+
+               /* Zero out stencils reaching to real boundary */
+			   
+               if ( constant_coefficient == 0 )
+				   SetStencilBndry(A,readgrid,readperiodic); 
+               HYPRE_StructMatrixAssemble(A);
+		   }
+          /* done with the case: rhs constructed, x0 read from file */
+		   
+		   /* the other case when read rhs > 0 and read x0 > 0  */
+		   if ((read_rhsfromfile_param > 0) && (read_x0fromfile_param > 0))
+		   {                    
+               /* read right hand side, extract grid, construct matrix,
+                  construct x0 */
+			   
+               hypre_printf("\ninitial rhs  from file prefix :%s\n",
+                            argv[read_rhsfromfile_index]);
+               hypre_printf("\ninitial x0  from file prefix :%s\n",
+                            argv[read_x0fromfile_index]);
+			   
+               /* ghost selection for vector  */
+               for (i = 0; i < dim; i++)
+               {
+				   v_num_ghost[2*i] = 1;
+				   v_num_ghost[2*i + 1] = 1;
+               }
+			   
+               b = (HYPRE_StructVector)
+				   hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+                                         argv[read_rhsfromfile_index],
+                                         v_num_ghost);
+
+               x = (HYPRE_StructVector)
+                  hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+                                         argv[read_x0fromfile_index],
+                                         v_num_ghost);
+
+               readgrid= hypre_StructVectorGrid(b) ;
+               readperiodic = hypre_StructGridPeriodic(readgrid); 
+
+               HYPRE_StructMatrixCreate(hypre_MPI_COMM_WORLD,
+                                        readgrid, stencil, &A);
+               HYPRE_StructMatrixSetSymmetric(A, 1);
+               HYPRE_StructMatrixSetNumGhost(A, A_num_ghost);
+               HYPRE_StructMatrixInitialize(A);
+
+               /*-----------------------------------------------------------
+                * Fill in the matrix elements
+                *-----------------------------------------------------------*/
+   
+               AddValuesMatrix(A,readgrid,cx,cy,cz,conx,cony,conz);
+
+               /* Zero out stencils reaching to real boundary */
+
+               if ( constant_coefficient == 0 )
+                  SetStencilBndry(A,readgrid,readperiodic); 
+               HYPRE_StructMatrixAssemble(A);
+		   }
+          /* done with the case: both rhs and x0 read from file */
+	   }
+       /* done with the case where no matrix is read from file */
+       
+	   if (read_fromfile_param == 1)  /* still sum > 0  */
+	   {   
+		   hypre_printf("\nreading matrix from file:%s\n",
+						argv[read_fromfile_index]);
+		   /* ghost selection for reading the matrix  */
+		   for (i = 0; i < dim; i++)
+		   {
+               A_num_ghost[2*i] = 1;
+               A_num_ghost[2*i + 1] = 1;
+		   }
+		   
+		   A = (HYPRE_StructMatrix)
+               hypre_StructMatrixRead(hypre_MPI_COMM_WORLD,
+                                      argv[read_fromfile_index], A_num_ghost);
+		   
+		   readgrid = hypre_StructMatrixGrid(A);
+		   readperiodic  =  hypre_StructGridPeriodic(readgrid);  
+		   
+		   if ((read_rhsfromfile_param > 0) && (read_x0fromfile_param == 0))
+		   {                
+               /* read right hand side ,construct x0 */
+               hypre_printf("\ninitial rhs from file prefix :%s\n",
+                            argv[read_rhsfromfile_index]);
+			   
+               /* ghost selection for vector  */
+               for (i = 0; i < dim; i++)
+               {
+				   v_num_ghost[2*i] = 1;
+				   v_num_ghost[2*i + 1] = 1;
+               }
+			   
+               b = (HYPRE_StructVector)
+				   hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+										  argv[read_rhsfromfile_index],
+										  v_num_ghost);
+			   
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid,&x);
+               HYPRE_StructVectorInitialize(x);
+               AddValuesVector(readgrid,x,periodx0,0.0);
+               HYPRE_StructVectorAssemble(x);
+		   }
+		   
+		   if ((read_rhsfromfile_param == 0) && (read_x0fromfile_param > 0))
+		   {                   
+               /* read x0, construct rhs*/
+               hypre_printf("\ninitial x0 from file prefix :%s\n",
+                            argv[read_x0fromfile_index]);
+
+               /* ghost selection for vector  */
+               for (i = 0; i < dim; i++)
+               {
+                  v_num_ghost[2*i] = 1;
+                  v_num_ghost[2*i + 1] = 1;
+               }
+  
+               x = (HYPRE_StructVector)
+                  hypre_StructVectorRead(hypre_MPI_COMM_WORLD,
+                                         argv[read_x0fromfile_index],
+                                         v_num_ghost);
+
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &b);
+               HYPRE_StructVectorInitialize(b);
+               AddValuesVector(readgrid,b,readperiodic,1.0);
+               HYPRE_StructVectorAssemble(b);
+            }
+
+            if ((read_rhsfromfile_param == 0) && (read_x0fromfile_param == 0))
+            {                    
+               /* construct x0 , construct b*/
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &b);
+               HYPRE_StructVectorInitialize(b);
+               AddValuesVector(readgrid,b,readperiodic,1.0);
+               HYPRE_StructVectorAssemble(b);
+
+
+               HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, readgrid, &x);
+               HYPRE_StructVectorInitialize(x);
+               AddValuesVector(readgrid,x,periodx0,0.0);
+               HYPRE_StructVectorAssemble(x); 
+            }   
+	   }
+	   /* finish the read of matrix  */
+   }
+   /* finish the sum > 0 case   */
+   
+   /*-----------------------------------------------------------
+	* Print out the system and initial guess
+	*-----------------------------------------------------------*/
+   
+   if (print_system)
+   {
+	   HYPRE_StructMatrixPrint("struct.out.A", A, 0);
+	   HYPRE_StructVectorPrint("struct.out.b", b, 0);
+	   HYPRE_StructVectorPrint("struct.out.x0", x, 0);
+   }
+   
+   /*-----------------------------------------------------------
+	* axpy
+	*-----------------------------------------------------------*/
+   
+#if !HYPRE_MFLOPS
+
+   hypre_MPI_Barrier(hypre_MPI_COMM_WORLD);
+   
+   if (solver_id == 0)
+   {
+       time_index = hypre_InitializeTiming("axpy");
+       hypre_BeginTiming(time_index);
+
+	   for ( rep=0; rep<reps; ++rep )
+	   {
+		   hypre_StructAxpy(2.0, b, x);
+	   }
+       hypre_EndTiming(time_index);
+
+	   hypre_PrintTiming("Time for axpy", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+   }
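+
+   /* a rough sanity check on the axpy timing (illustrative): each repetition
+      computes x <- x + 2.0*b, i.e. about 2*N flops for N grid points, so the
+      reported time divided by reps*2*N estimates the cost per flop */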
+   
+   /*-----------------------------------------------------------
+	* sparse matrix vector multiplication
+	*-----------------------------------------------------------*/
+   
+   else if ( solver_id == 1 || solver_id == 3 || solver_id == 4 )
+   {
+	   void *matvec_data;
+
+	   matvec_data = hypre_StructMatvecCreate();
+	   hypre_StructMatvecSetup(matvec_data, A, x);
+	   
+	   time_index = hypre_InitializeTiming("Mat-Vec");
+	   hypre_BeginTiming(time_index);
+
+	   for ( rep=0; rep<reps; ++rep )
+	   {
+		   hypre_StructMatvecCompute(matvec_data, 1.0, A, x, 1.0, b);
+	   }
+       hypre_EndTiming(time_index);
+
+	   hypre_PrintTiming("Time for Mat-Vec", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+   }
+
+   /*-----------------------------------------------------------
+	* inner product
+	*-----------------------------------------------------------*/
+
+   else if (solver_id == 2)
+   {
+	   time_index = hypre_InitializeTiming("inner");
+	   hypre_BeginTiming(time_index);
+	   for ( rep=0; rep<reps; ++rep )
+	   {
+		   inner = hypre_StructInnerProd(x,b);
+	   }
+       hypre_EndTiming(time_index);
+
+	   hypre_PrintTiming("Time for inner product", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+   }
+
+   /*-----------------------------------------------------------
+	* Solve the system using Jacobi
+	*-----------------------------------------------------------*/
+   
+   else if ( solver_id == 8 )
+   {
+	   time_index = hypre_InitializeTiming("Jacobi Setup");
+	   hypre_BeginTiming(time_index);
+	   
+	   HYPRE_StructJacobiCreate(hypre_MPI_COMM_WORLD, &solver);
+	   HYPRE_StructJacobiSetMaxIter(solver, 100);
+	   HYPRE_StructJacobiSetTol(solver, 1.0e-06);
+	   HYPRE_StructJacobiSetup(solver, A, b, x);
+	   
+	   hypre_EndTiming(time_index);
+	   hypre_PrintTiming("Setup phase times", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+	   
+	   time_index = hypre_InitializeTiming("Jacobi Solve");
+	   hypre_BeginTiming(time_index);
+	   
+	   HYPRE_StructJacobiSolve(solver, A, b, x);
+	   
+	   hypre_EndTiming(time_index);
+	   hypre_PrintTiming("Solve phase times", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+	   
+	   HYPRE_StructJacobiGetNumIterations(solver, &num_iterations);
+	   HYPRE_StructJacobiGetFinalRelativeResidualNorm(solver, &final_res_norm);
+	   HYPRE_StructJacobiDestroy(solver);
+   }
+
+   /*-----------------------------------------------------------
+	* Solve the system using CG
+	*-----------------------------------------------------------*/
+   
+   if ((solver_id > 9) && (solver_id < 20))
+   {
+      /* intentionally empty: the CG solvers (IDs 10-19) are not exercised
+         by this kernel-benchmark driver */
+   }
+
+   /*-----------------------------------------------------------
+	* Print the solution and other info
+	*-----------------------------------------------------------*/
+
+   if (print_system)
+   {
+	   HYPRE_StructVectorPrint("struct.out.x", x, 0);
+   }
+#endif
+   /*-----------------------------------------------------------
+	* Compute MFLOPs for Matvec
+	*-----------------------------------------------------------*/
+
+#if HYPRE_MFLOPS
+   {
+	   void *matvec_data;
+	   HYPRE_Int   i, imax, N;
+	   
+	   /* compute imax */
+	   N = (P*nx)*(Q*ny)*(R*nz);
+	   imax = (5*1000000) / N;
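+       /* note: imax is chosen so that imax*N is roughly 5 million point
+          updates regardless of problem size, keeping the total matvec work
+          (and hence the timing window) approximately constant */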
+	   
+	   matvec_data = hypre_StructMatvecCreate();
+	   hypre_StructMatvecSetup(matvec_data, A, x);
+	   
+	   time_index = hypre_InitializeTiming("Matvec");
+	   hypre_BeginTiming(time_index);
+	   
+	   for (i = 0; i < imax; i++)
+	   {
+		   hypre_StructMatvecCompute(matvec_data, 1.0, A, x, 1.0, b);
+	   }
+	   /* this counts mult-adds */
+	   hypre_IncFLOPCount(7*N*imax);
+	   
+	   hypre_EndTiming(time_index);
+	   hypre_PrintTiming("Matvec time", hypre_MPI_COMM_WORLD);
+	   hypre_FinalizeTiming(time_index);
+	   hypre_ClearTiming();
+	   
+	   hypre_StructMatvecDestroy(matvec_data);
+   }
+#endif
+   
+   /*-----------------------------------------------------------
+	* Finalize things
+	*-----------------------------------------------------------*/
+   
+   HYPRE_StructStencilDestroy(stencil);
+   HYPRE_StructMatrixDestroy(A);
+   HYPRE_StructVectorDestroy(b);
+   HYPRE_StructVectorDestroy(x);
+   
+   for ( i = 0; i < (2 - sym)*dim + 1; i++)
+       hypre_TFree(offsets[i]);
+   hypre_TFree(offsets);
+   
+   hypre_FinalizeMemoryDebug();
+   
+   /* Finalize MPI */
+   hypre_MPI_Finalize();
+#if defined(HYPRE_USE_KOKKOS)
+   Kokkos::finalize ();
+#endif
+   return (0);
+}
+
+/*-------------------------------------------------------------------------
+ * Add a constant value to a vector. Requires an initialized vector, the
+ * grid, the grid's periodicity, and the constant value.
+ *-------------------------------------------------------------------------*/
+
+HYPRE_Int
+AddValuesVector( hypre_StructGrid  *gridvector,
+                 hypre_StructVector *zvector,
+                 HYPRE_Int          *period, 
+                 HYPRE_Real         value  )
+{
+   HYPRE_Int ierr = 0;
+   hypre_BoxArray     *gridboxes;
+   HYPRE_Int          i,ib;
+   hypre_IndexRef     ilower;
+   hypre_IndexRef     iupper;
+   hypre_Box          *box;
+   HYPRE_Real         *values;
+   HYPRE_Int          volume,dim;
+
+   gridboxes =  hypre_StructGridBoxes(gridvector);
+   dim       =  hypre_StructGridNDim(gridvector);
+
+   ib=0;
+   hypre_ForBoxI(ib, gridboxes)
+   {
+      box      = hypre_BoxArrayBox(gridboxes, ib);
+      volume   =  hypre_BoxVolume(box);
+      values   = hypre_CTAlloc(HYPRE_Real, volume);
+
+      /*-----------------------------------------------------------
+       * For periodic b.c. in all directions, need rhs to satisfy 
+       * compatibility condition. Achieved by setting a source and
+       *  sink of equal strength.  All other problems have rhs = 1.
+       *-----------------------------------------------------------*/
+
+      if ((dim == 2 && period[0] != 0 && period[1] != 0) ||
+          (dim == 3 && period[0] != 0 && period[1] != 0 && period[2] != 0))
+      {
+         for (i = 0; i < volume; i++)
+         {
+            values[i] = 0.0;
+         }
+         values[0]         =  value;
+         values[volume - 1] = -value;
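+         /* note: the fully periodic operator annihilates constant vectors,
+            so a solvable rhs must sum to zero; the +value/-value pair above
+            satisfies that compatibility condition exactly */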
+      }
+      else
+      {
+         for (i = 0; i < volume; i++)
+         {
+            values[i] = value;
+         }
+      }
+
+      ilower = hypre_BoxIMin(box);
+      iupper = hypre_BoxIMax(box);
+      HYPRE_StructVectorSetBoxValues(zvector, ilower, iupper, values);
+      hypre_TFree(values);
+
+   }
+
+   return ierr;
+}
+
+/******************************************************************************
+ * Adds values to the matrix based on a 7-point (3D) stencil for a
+ * convection-diffusion problem. It needs an initialized matrix, an
+ * assembled grid, and the constants that determine the 7-point (3D)
+ * convection-diffusion discretization.
+ ******************************************************************************/
+
+HYPRE_Int
+AddValuesMatrix(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,
+                HYPRE_Real        cx,
+                HYPRE_Real        cy,
+                HYPRE_Real        cz,
+                HYPRE_Real        conx,
+                HYPRE_Real        cony,
+                HYPRE_Real        conz)
+{
+
+   HYPRE_Int ierr=0;
+   hypre_BoxArray     *gridboxes;
+   HYPRE_Int           i,s,bi;
+   hypre_IndexRef      ilower;
+   hypre_IndexRef      iupper;
+   hypre_Box          *box;
+   HYPRE_Real         *values;
+   HYPRE_Real          east,west;
+   HYPRE_Real          north,south;
+   HYPRE_Real          top,bottom;
+   HYPRE_Real          center;
+   HYPRE_Int           volume,dim,sym;
+   HYPRE_Int          *stencil_indices;
+   HYPRE_Int           stencil_size;
+   HYPRE_Int           constant_coefficient;
+
+   gridboxes =  hypre_StructGridBoxes(gridmatrix);
+   dim       =  hypre_StructGridNDim(gridmatrix);
+   sym       =  hypre_StructMatrixSymmetric(A);
+   constant_coefficient = hypre_StructMatrixConstantCoefficient(A);
+
+   bi=0;
+
+   east = -cx;
+   west = -cx;
+   north = -cy;
+   south = -cy;
+   top = -cz;
+   bottom = -cz;
+   center = 2.0*cx;
+   if (dim > 1) center += 2.0*cy;
+   if (dim > 2) center += 2.0*cz;
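+
+   /* e.g. with cx = cy = cz = 1 and no convection this is the standard
+      7-point Laplacian in 3D: center = 6 with -1 in each of the six
+      off-diagonal directions */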
+
+   stencil_size = 1 + (2 - sym) * dim;
+   stencil_indices = hypre_CTAlloc(HYPRE_Int, stencil_size);
+   for (s = 0; s < stencil_size; s++)
+   {
+      stencil_indices[s] = s;
+   }
+
+   if(sym)
+   {
+      if ( constant_coefficient==0 )
+      {
+         hypre_ForBoxI(bi, gridboxes)
+         {
+            box      = hypre_BoxArrayBox(gridboxes, bi);
+            volume   =  hypre_BoxVolume(box);
+            values   = hypre_CTAlloc(HYPRE_Real, stencil_size*volume);
+
+            for (i = 0; i < stencil_size*volume; i += stencil_size)
+            {
+               switch (dim)
+               {
+                  case 1:
+                     values[i  ] = west;
+                     values[i+1] = center;
+                     break;
+                  case 2:
+                     values[i  ] = west;
+                     values[i+1] = south;
+                     values[i+2] = center;
+                     break;
+                  case 3:
+                     values[i  ] = west;
+                     values[i+1] = south;
+                     values[i+2] = bottom;
+                     values[i+3] = center;
+                     break;
+               }
+            }
+            ilower = hypre_BoxIMin(box);
+            iupper = hypre_BoxIMax(box);
+            HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size,
+                                           stencil_indices, values);
+            hypre_TFree(values);
+         }
+      }
+      else if ( constant_coefficient==1 )
+      {
+         values   = hypre_CTAlloc(HYPRE_Real, stencil_size);
+         switch (dim)
+         {
+            case 1:
+               values[0] = west;
+               values[1] = center;
+               break;
+            case 2:
+               values[0] = west;
+               values[1] = south;
+               values[2] = center;
+               break;
+            case 3:
+               values[0] = west;
+               values[1] = south;
+               values[2] = bottom;
+               values[3] = center;
+               break;
+         }
+         if (hypre_BoxArraySize(gridboxes) > 0)
+         {
+            HYPRE_StructMatrixSetConstantValues(A, stencil_size,
+                                                stencil_indices, values);
+         }
+         hypre_TFree(values);
+      }
+      else
+      {
+         hypre_assert( constant_coefficient==2 );
+
+         /* stencil index for the center equals dim, so it's easy to leave out */
+         values   = hypre_CTAlloc(HYPRE_Real, stencil_size-1);
+         switch (dim)
+         {
+            case 1:
+               values[0] = west;
+               break;
+            case 2:
+               values[0] = west;
+               values[1] = south;
+               break;
+            case 3:
+               values[0] = west;
+               values[1] = south;
+               values[2] = bottom;
+               break;
+         }
+         if (hypre_BoxArraySize(gridboxes) > 0)
+         {
+            HYPRE_StructMatrixSetConstantValues(A, stencil_size-1,
+                                                stencil_indices, values);
+         }
+         hypre_TFree(values);
+
+         hypre_ForBoxI(bi, gridboxes)
+         {
+            box      = hypre_BoxArrayBox(gridboxes, bi);
+            volume   =  hypre_BoxVolume(box);
+            values   = hypre_CTAlloc(HYPRE_Real, volume);
+
+            for ( i=0; i < volume; ++i )
+            {
+               values[i] = center;
+            }
+            ilower = hypre_BoxIMin(box);
+            iupper = hypre_BoxIMax(box);
+            HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1,
+                                           stencil_indices+dim, values);
+            hypre_TFree(values);
+         }
+      }
+   }
+   else
+   {
+      if (conx > 0.0)
+      {
+         west   -= conx;
+         center += conx;
+      }
+      else if (conx < 0.0) 
+      {
+         east   += conx;
+         center -= conx;
+      }
+      if (cony > 0.0)
+      {
+         south  -= cony;
+         center += cony;
+      }
+      else if (cony < 0.0) 
+      {
+         north  += cony;
+         center -= cony;
+      }
+      if (conz > 0.0)
+      {
+         bottom -= conz;
+         center += conz;
+      }
+      else if (conz < 0.0)
+      {
+         top    += conz;
+         center -= conz;
+      }
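+
+      /* the sign tests above implement first-order upwinding of the
+         convection terms: a positive coefficient biases the stencil toward
+         the upwind (west/south/bottom) neighbor, a negative one toward
+         east/north/top, which keeps the one-sided differences stable */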
+
+      if ( constant_coefficient==0 )
+      {
+         hypre_ForBoxI(bi, gridboxes)
+         {
+            box      = hypre_BoxArrayBox(gridboxes, bi);
+            volume   =  hypre_BoxVolume(box);
+            values   = hypre_CTAlloc(HYPRE_Real, stencil_size*volume);
+
+            for (i = 0; i < stencil_size*volume; i += stencil_size)
+            {
+               switch (dim)
+               {
+                  case 1:
+                     values[i  ] = west;
+                     values[i+1] = center;
+                     values[i+2] = east;
+                     break;
+                  case 2:
+                     values[i  ] = west;
+                     values[i+1] = south;
+                     values[i+2] = center;
+                     values[i+3] = east;
+                     values[i+4] = north;
+                     break;
+                  case 3:
+                     values[i  ] = west;
+                     values[i+1] = south;
+                     values[i+2] = bottom;
+                     values[i+3] = center;
+                     values[i+4] = east;
+                     values[i+5] = north;
+                     values[i+6] = top;
+                     break;
+               }
+            }
+            ilower = hypre_BoxIMin(box);
+            iupper = hypre_BoxIMax(box);
+            HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, stencil_size,
+                                           stencil_indices, values);
+
+            hypre_TFree(values);
+         }
+      }
+      else if ( constant_coefficient==1 )
+      {
+         values = hypre_CTAlloc( HYPRE_Real, stencil_size );
+
+         switch (dim)
+         {
+            case 1:
+               values[0] = west;
+               values[1] = center;
+               values[2] = east;
+               break;
+            case 2:
+               values[0] = west;
+               values[1] = south;
+               values[2] = center;
+               values[3] = east;
+               values[4] = north;
+               break;
+            case 3:
+               values[0] = west;
+               values[1] = south;
+               values[2] = bottom;
+               values[3] = center;
+               values[4] = east;
+               values[5] = north;
+               values[6] = top;
+               break;
+         }
+
+         if (hypre_BoxArraySize(gridboxes) > 0)
+         {
+            HYPRE_StructMatrixSetConstantValues(A, stencil_size,
+                                                stencil_indices, values);
+         }
+
+         hypre_TFree(values);
+      }
+      else
+      {
+         hypre_assert( constant_coefficient==2 );
+         values = hypre_CTAlloc( HYPRE_Real, stencil_size-1 );
+         switch (dim)
+         {  /* no center in stencil_indices and values */
+            case 1:
+               stencil_indices[0] = 0;
+               stencil_indices[1] = 2;
+               values[0] = west;
+               values[1] = east;
+               break;
+            case 2:
+               stencil_indices[0] = 0;
+               stencil_indices[1] = 1;
+               stencil_indices[2] = 3;
+               stencil_indices[3] = 4;
+               values[0] = west;
+               values[1] = south;
+               values[2] = east;
+               values[3] = north;
+               break;
+            case 3:
+               stencil_indices[0] = 0;
+               stencil_indices[1] = 1;
+               stencil_indices[2] = 2;
+               stencil_indices[3] = 4;
+               stencil_indices[4] = 5;
+               stencil_indices[5] = 6;
+               values[0] = west;
+               values[1] = south;
+               values[2] = bottom;
+               values[3] = east;
+               values[4] = north;
+               values[5] = top;
+               break;
+         }
+
+         if (hypre_BoxArraySize(gridboxes) > 0)
+         {
+            HYPRE_StructMatrixSetConstantValues(A, stencil_size-1,
+                                                stencil_indices, values);
+         }
+         hypre_TFree(values);
+
+
+         /* center is variable */
+         stencil_indices[0] = dim; /* refers to center */
+         hypre_ForBoxI(bi, gridboxes)
+         {
+            box      = hypre_BoxArrayBox(gridboxes, bi);
+            volume   =  hypre_BoxVolume(box);
+            values   = hypre_CTAlloc(HYPRE_Real, volume);
+
+            for ( i=0; i < volume; ++i )
+            {
+               values[i] = center;
+            }
+            ilower = hypre_BoxIMin(box);
+            iupper = hypre_BoxIMax(box);
+            HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1,
+                                           stencil_indices, values);
+            hypre_TFree(values);
+         }
+      }
+   }
+
+   hypre_TFree(stencil_indices);
+
+   return ierr;
+}
+
+/*********************************************************************************
+ * This function sets to zero the stencil entries that reach the real
+ * boundary. The grid, the matrix, and the period are needed.
+ *********************************************************************************/
+
+HYPRE_Int
+SetStencilBndry(HYPRE_StructMatrix A,HYPRE_StructGrid gridmatrix,HYPRE_Int* period)
+{
+
+   HYPRE_Int ierr=0;
+   hypre_BoxArray    *gridboxes;
+   HYPRE_Int          size,i,j,d,ib;
+   HYPRE_Int        **ilower;
+   HYPRE_Int        **iupper;
+   HYPRE_Int         *vol;
+   HYPRE_Int         *istart, *iend;
+   hypre_Box         *box;
+   hypre_Box         *dummybox;
+   hypre_Box         *boundingbox;
+   HYPRE_Real        *values;
+   HYPRE_Int          volume, dim;
+   HYPRE_Int         *stencil_indices;
+   HYPRE_Int          constant_coefficient;
+
+   gridboxes       = hypre_StructGridBoxes(gridmatrix);
+   boundingbox     = hypre_StructGridBoundingBox(gridmatrix);
+   istart          = hypre_BoxIMin(boundingbox);
+   iend            = hypre_BoxIMax(boundingbox);
+   size            = hypre_StructGridNumBoxes(gridmatrix);
+   dim             = hypre_StructGridNDim(gridmatrix);
+   constant_coefficient = hypre_StructMatrixConstantCoefficient(A);
+   if ( constant_coefficient > 0 ) return 1;
+   /* no space dependence if constant_coefficient==1, and space dependence
+      only for the diagonal if constant_coefficient==2 -- this function only
+      touches off-diagonal entries, so there is nothing to do in either case */
+
+   /* allocate only after the early return above, so nothing leaks */
+   stencil_indices = hypre_CTAlloc(HYPRE_Int, 1);
+
+   vol    = hypre_CTAlloc(HYPRE_Int, size);
+   ilower = hypre_CTAlloc(HYPRE_Int*, size);
+   iupper = hypre_CTAlloc(HYPRE_Int*, size);
+   for (i = 0; i < size; i++)
+   {
+      ilower[i] = hypre_CTAlloc(HYPRE_Int, dim);
+      iupper[i] = hypre_CTAlloc(HYPRE_Int, dim);
+   }
+
+   i = 0;
+   ib = 0;
+   hypre_ForBoxI(i, gridboxes)
+   {
+      dummybox = hypre_BoxCreate(dim);
+      box      = hypre_BoxArrayBox(gridboxes, i);
+      volume   =  hypre_BoxVolume(box);
+      vol[i]   = volume;
+      hypre_CopyBox(box,dummybox);
+      for (d = 0; d < dim; d++)
+      {
+         ilower[ib][d] = hypre_BoxIMinD(dummybox,d);
+         iupper[ib][d] = hypre_BoxIMaxD(dummybox,d);
+      }
+      ib++ ;
+      hypre_BoxDestroy(dummybox);
+   }
+
+   if ( constant_coefficient==0 )
+   {
+      for (d = 0; d < dim; d++)
+      {
+         for (ib = 0; ib < size; ib++)
+         {
+            values = hypre_CTAlloc(HYPRE_Real, vol[ib]);
+        
+            for (i = 0; i < vol[ib]; i++)
+            {
+               values[i] = 0.0;
+            }
+
+            if( ilower[ib][d] == istart[d] && period[d] == 0 )
+            {
+               j = iupper[ib][d];
+               iupper[ib][d] = istart[d];
+               stencil_indices[0] = d;
+               HYPRE_StructMatrixSetBoxValues(A, ilower[ib], iupper[ib],
+                                              1, stencil_indices, values);
+               iupper[ib][d] = j;
+            }
+
+            if( iupper[ib][d] == iend[d] && period[d] == 0 )
+            {
+               j = ilower[ib][d];
+               ilower[ib][d] = iend[d];
+               stencil_indices[0] = dim + 1 + d;
+               HYPRE_StructMatrixSetBoxValues(A, ilower[ib], iupper[ib],
+                                              1, stencil_indices, values);
+               ilower[ib][d] = j;
+            }
+            hypre_TFree(values);
+         }
+      }
+   }
+  
+   hypre_TFree(vol);
+   hypre_TFree(stencil_indices);
+   for (ib =0 ; ib < size ; ib++)
+   {
+      hypre_TFree(ilower[ib]);
+      hypre_TFree(iupper[ib]);
+   }
+   hypre_TFree(ilower);
+   hypre_TFree(iupper);
+
+   return ierr;
+}
diff --git a/src/test/zboxloop.c b/src/test/zboxloop.c
index 2a6032f..04ddcb6 100644
--- a/src/test/zboxloop.c
+++ b/src/test/zboxloop.c
@@ -38,7 +38,8 @@ main( hypre_int argc,
    HYPRE_Int         rep, reps, fail, sum;
    HYPRE_Int         size;
    hypre_Box        *x1_data_box, *x2_data_box, *x3_data_box, *x4_data_box;
-   HYPRE_Int         xi1, xi2, xi3, xi4;
+   //HYPRE_Int         xi1, xi2, xi3, xi4;
+   HYPRE_Int         xi1;
    HYPRE_Real       *xp1, *xp2, *xp3, *xp4;
    hypre_Index       loop_size, start, unit_stride, index;
    
@@ -192,7 +193,7 @@ main( hypre_int argc,
    zypre_BoxLoop1Begin(dim, loop_size,
                        x1_data_box, start, unit_stride, xi1);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
    zypre_BoxLoop1For(xi1)
    {
@@ -203,9 +204,8 @@ main( hypre_int argc,
    /* Use old boxloop to check that values are set to 1 */
    fail = 0;
    sum = 0;
-   hypre_BoxLoop1Begin(3, loop_size,
-                       x1_data_box, start, unit_stride, xi1);
-   hypre_BoxLoop1For(xi1)
+   hypre_SerialBoxLoop1Begin(3, loop_size,
+			     x1_data_box, start, unit_stride, xi1);
    {
       sum += xp1[xi1];
       if (xp1[xi1] != 1)
@@ -216,7 +216,7 @@ main( hypre_int argc,
          fail = 1;
       }
    }
-   hypre_BoxLoop1End(xi1);
+   hypre_SerialBoxLoop1End(xi1);
 
    if (sum != (nx*ny*nz))
    {
@@ -251,7 +251,7 @@ main( hypre_int argc,
       hypre_BoxLoop0For()
       {
          xp1[xi1] += xp1[xi1];
-         xi1++;
+         //xi1++;
       }
       hypre_BoxLoop0End();
    }
@@ -265,7 +265,7 @@ main( hypre_int argc,
       hypre_BoxLoop1Begin(3, loop_size,
                           x1_data_box, start, unit_stride, xi1);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop1For(xi1)
       {
@@ -284,7 +284,7 @@ main( hypre_int argc,
                           x1_data_box, start, unit_stride, xi1,
                           x2_data_box, start, unit_stride, xi2);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1,xi2) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop2For(xi1, xi2)
       {
@@ -304,7 +304,7 @@ main( hypre_int argc,
                           x2_data_box, start, unit_stride, xi2,
                           x3_data_box, start, unit_stride, xi3);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1,xi2,xi3) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop3For(xi1, xi2, xi3)
       {
@@ -325,7 +325,7 @@ main( hypre_int argc,
                           x3_data_box, start, unit_stride, xi3,
                           x4_data_box, start, unit_stride, xi4);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(HYPRE_BOX_PRIVATE,xi1,xi2,xi3,xi4) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       hypre_BoxLoop4For(xi1, xi2, xi3, xi4)
       {
@@ -370,7 +370,7 @@ main( hypre_int argc,
       zypre_BoxLoop1Begin(dim, loop_size,
                           x1_data_box, start, unit_stride, xi1);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       zypre_BoxLoop1For(xi1)
       {
@@ -389,7 +389,7 @@ main( hypre_int argc,
                           x1_data_box, start, unit_stride, xi1,
                           x2_data_box, start, unit_stride, xi2);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1,xi2) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       zypre_BoxLoop2For(xi1, xi2)
       {
@@ -409,7 +409,7 @@ main( hypre_int argc,
                           x2_data_box, start, unit_stride, xi2,
                           x3_data_box, start, unit_stride, xi3);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1,xi2,xi3) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       zypre_BoxLoop3For(xi1, xi2, xi3)
       {
@@ -430,7 +430,7 @@ main( hypre_int argc,
                           x3_data_box, start, unit_stride, xi3,
                           x4_data_box, start, unit_stride, xi4);
 #ifdef HYPRE_USING_OPENMP
-#pragma omp parallel for private(ZYPRE_BOX_PRIVATE,xi1,xi2,xi3,xi4) HYPRE_SMP_SCHEDULE
+#pragma omp parallel for private(ZYPRE_BOX_PRIVATE) HYPRE_SMP_SCHEDULE
 #endif
       zypre_BoxLoop4For(xi1, xi2, xi3, xi4)
       {
diff --git a/src/utilities/HYPRE_utilities.h b/src/utilities/HYPRE_utilities.h
index 35bbcdf..ccbd9a0 100644
--- a/src/utilities/HYPRE_utilities.h
+++ b/src/utilities/HYPRE_utilities.h
@@ -43,25 +43,51 @@ extern "C" {
  * Big int stuff
  *--------------------------------------------------------------------------*/
 
-#ifdef HYPRE_BIGINT
+#if defined(HYPRE_BIGINT)
 typedef long long int HYPRE_Int;
 #define HYPRE_MPI_INT MPI_LONG_LONG_INT
-#else 
+
+#else /* default */
 typedef int HYPRE_Int;
 #define HYPRE_MPI_INT MPI_INT
 #endif
 
 /*--------------------------------------------------------------------------
- * Complex stuff
+ * Real and Complex types
  *--------------------------------------------------------------------------*/
 
+#include <float.h>
+
+#if defined(HYPRE_SINGLE)
+typedef float HYPRE_Real;
+#define HYPRE_REAL_MAX FLT_MAX
+#define HYPRE_REAL_MIN FLT_MIN
+#define HYPRE_REAL_EPSILON FLT_EPSILON
+#define HYPRE_REAL_MIN_EXP FLT_MIN_EXP
+#define HYPRE_MPI_REAL MPI_FLOAT
+
+#elif defined(HYPRE_LONG_DOUBLE)
+typedef long double HYPRE_Real;
+#define HYPRE_REAL_MAX LDBL_MAX
+#define HYPRE_REAL_MIN LDBL_MIN
+#define HYPRE_REAL_EPSILON LDBL_EPSILON
+#define HYPRE_REAL_MIN_EXP LDBL_MIN_EXP
+#define HYPRE_MPI_REAL MPI_LONG_DOUBLE
+
+#else /* default */
 typedef double HYPRE_Real;
+#define HYPRE_REAL_MAX DBL_MAX
+#define HYPRE_REAL_MIN DBL_MIN
+#define HYPRE_REAL_EPSILON DBL_EPSILON
+#define HYPRE_REAL_MIN_EXP DBL_MIN_EXP
 #define HYPRE_MPI_REAL MPI_DOUBLE
+#endif
 
-#ifdef HYPRE_COMPLEX
+#if defined(HYPRE_COMPLEX)
 typedef double _Complex HYPRE_Complex;
 #define HYPRE_MPI_COMPLEX MPI_C_DOUBLE_COMPLEX  /* or MPI_LONG_DOUBLE ? */
-#else 
+
+#else  /* default */
 typedef HYPRE_Real HYPRE_Complex;
 #define HYPRE_MPI_COMPLEX HYPRE_MPI_REAL
 #endif
diff --git a/src/utilities/Makefile b/src/utilities/Makefile
index d62d755..512e6d0 100644
--- a/src/utilities/Makefile
+++ b/src/utilities/Makefile
@@ -60,7 +60,9 @@ FILES =\
  F90_HYPRE_error.c\
  hypre_prefix_sum.c\
  hypre_merge_sort.c\
- hypre_hopscotch_hash.c
+ hypre_hopscotch_hash.c\
+ gpuErrorCheck.c\
+ gpuMem.c
 
 OBJS = ${FILES:.c=.o}
 
diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h
index c21b717..c6ccc82 100644
--- a/src/utilities/_hypre_utilities.h
+++ b/src/utilities/_hypre_utilities.h
@@ -1,14 +1,6 @@
-/*BHEADER**********************************************************************
- * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
- * Produced at the Lawrence Livermore National Laboratory.
- * This file is part of HYPRE.  See file COPYRIGHT for details.
- *
- * HYPRE is free software; you can redistribute it and/or modify it under the
- * terms of the GNU Lesser General Public License (as published by the Free
- * Software Foundation) version 2.1 dated February 1999.
- *
- * $Revision$
- ***********************************************************************EHEADER*/
+
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
 
 #ifndef hypre_UTILITIES_HEADER
 #define hypre_UTILITIES_HEADER
@@ -19,15 +11,6 @@
 #include <omp.h>
 #endif
 
-/* This allows us to consistently avoid 'int' throughout hypre */
-typedef int               hypre_int;
-typedef long int          hypre_longint;
-typedef unsigned int      hypre_uint;
-typedef unsigned long int hypre_ulongint;
-
-/* This allows us to consistently avoid 'double' throughout hypre */
-typedef double            hypre_double;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -53,6 +36,15 @@ extern "C" {
 #ifndef hypre_GENERAL_HEADER
 #define hypre_GENERAL_HEADER
 
+/* This allows us to consistently avoid 'int' throughout hypre */
+typedef int               hypre_int;
+typedef long int          hypre_longint;
+typedef unsigned int      hypre_uint;
+typedef unsigned long int hypre_ulongint;
+
+/* This allows us to consistently avoid 'double' throughout hypre */
+typedef double            hypre_double;
+
 /*--------------------------------------------------------------------------
  * Define various functions
  *--------------------------------------------------------------------------*/
@@ -130,7 +122,9 @@ extern "C" {
 
 #define MPI_BOTTOM  	    hypre_MPI_BOTTOM
 
+#define MPI_FLOAT           hypre_MPI_FLOAT
 #define MPI_DOUBLE          hypre_MPI_DOUBLE           
+#define MPI_LONG_DOUBLE     hypre_MPI_LONG_DOUBLE
 #define MPI_INT             hypre_MPI_INT              
 #define MPI_LONG_LONG_INT   hypre_MPI_INT              
 #define MPI_CHAR            hypre_MPI_CHAR             
@@ -232,13 +226,15 @@ typedef HYPRE_Int  hypre_MPI_Aint;
 
 #define  hypre_MPI_BOTTOM  0x0
 
-#define  hypre_MPI_DOUBLE 0
-#define  hypre_MPI_INT 1
-#define  hypre_MPI_CHAR 2
-#define  hypre_MPI_LONG 3
-#define  hypre_MPI_BYTE 4
-#define  hypre_MPI_REAL 5
-#define  hypre_MPI_COMPLEX 6
+#define  hypre_MPI_FLOAT 0
+#define  hypre_MPI_DOUBLE 1
+#define  hypre_MPI_LONG_DOUBLE 2
+#define  hypre_MPI_INT 3
+#define  hypre_MPI_CHAR 4
+#define  hypre_MPI_LONG 5
+#define  hypre_MPI_BYTE 6
+#define  hypre_MPI_REAL 7
+#define  hypre_MPI_COMPLEX 8
 
 #define  hypre_MPI_SUM 0
 #define  hypre_MPI_MIN 1
@@ -272,7 +268,9 @@ typedef MPI_User_function    hypre_MPI_User_function;
 #define  hypre_MPI_BOTTOM     MPI_BOTTOM
 #define  hypre_MPI_COMM_SELF  MPI_COMM_SELF
 
+#define  hypre_MPI_FLOAT   MPI_FLOAT
 #define  hypre_MPI_DOUBLE  MPI_DOUBLE
+#define  hypre_MPI_LONG_DOUBLE  MPI_LONG_DOUBLE
 /* HYPRE_MPI_INT is defined in HYPRE_utilities.h */
 #define  hypre_MPI_INT     HYPRE_MPI_INT
 #define  hypre_MPI_CHAR    MPI_CHAR
@@ -288,6 +286,7 @@ typedef MPI_User_function    hypre_MPI_User_function;
 #define  hypre_MPI_MAX MPI_MAX
 #define  hypre_MPI_LOR MPI_LOR
 #define  hypre_MPI_SUCCESS MPI_SUCCESS
+#define  hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE
 
 #define  hypre_MPI_UNDEFINED       MPI_UNDEFINED   
 #define  hypre_MPI_REQUEST_NULL    MPI_REQUEST_NULL
@@ -295,7 +294,6 @@ typedef MPI_User_function    hypre_MPI_User_function;
 #define  hypre_MPI_ANY_TAG         MPI_ANY_TAG
 #define  hypre_MPI_SOURCE          MPI_SOURCE
 #define  hypre_MPI_TAG             MPI_TAG
-#define  hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE
 #define  hypre_MPI_LAND            MPI_LAND
 
 #endif
@@ -415,6 +413,157 @@ HYPRE_Int hypre_MPI_Op_create( hypre_MPI_User_function *function , hypre_int com
 extern "C" {
 #endif
 
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_MANAGED)
+#ifdef __cplusplus
+extern "C++" {
+#endif
+#include <cuda.h>
+#include <cuda_runtime.h>
+#ifdef __cplusplus
+}
+#endif
+#define HYPRE_CUDA_GLOBAL __host__ __device__
+  
+#if defined(HYPRE_MEMORY_GPU)
+#define hypre_DeviceTAlloc(type, count) \
+  ({									\
+    type * ptr;								\
+    cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \
+    if ( cudaerr != cudaSuccess ) {					\
+      printf("\n ERROR hypre_DeviceTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+      HYPRE_Int *p = NULL; *p = 1;						\
+    }									\
+    ptr;})
+	
+#define hypre_DeviceCTAlloc(type, count) \
+	({								   \
+	type * ptr;						   \
+	cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n ERROR hypre_DeviceCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}		\
+	cudaMemset(ptr,0,sizeof(type)*(count));	   \
+	ptr;})									   \
+	
+#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr;		\
+	cudaMalloc((void**)&newptr,sizeof(type)*(count));		\
+	/* note: copies 'count' elements, so the old block must be at least that large */ \
+	cudaMemcpy(newptr, ptr, sizeof(type)*(count), cudaMemcpyDeviceToDevice); \
+	cudaFree(ptr);							\
+	ptr = newptr;}
+#else
+ #define hypre_DeviceTAlloc(type, count) \
+	({																	\
+	type * ptr;															\
+	cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal);\
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n ERROR hypre_DeviceTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}\
+	ptr;})
+	
+#define hypre_DeviceCTAlloc(type, count) \
+	({								   \
+	type * ptr;						   \
+	cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n ERROR hypre_DeviceCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}		\
+	cudaMemset(ptr,0,sizeof(type)*(count));	   \
+	ptr;})									   \
+	
+#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr;		\
+	cudaMallocManaged((void**)&newptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+	/* managed memory is host-accessible, so a plain memcpy is safe here */ \
+	memcpy(newptr, ptr, sizeof(type)*(count));			\
+	cudaFree(ptr);							\
+	ptr = newptr;}
+#endif
+  
+#define hypre_DeviceTFree(ptr) \
+	{											\
+		cudaError_t cudaerr = cudaFree(ptr);							\
+		if ( cudaerr != cudaSuccess ) {									\
+			printf("\n CudaFree : %s in %s(%d) function %s\n",cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+			HYPRE_Int *p = NULL; *p = 1;										\
+		}																\
+	}																	\
+	
+
+#define hypre_DataCopyToData(ptrH,ptrD,type,count)						\
+	{cudaError_t cudaerr = cudaMemcpy(ptrD, ptrH, sizeof(type)*count, cudaMemcpyHostToDevice); \
+if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCopyToData %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+}							  \
+	}
+	
+	
+#define hypre_DataCopyFromData(ptrH,ptrD,type,count)						\
+	{cudaError_t cudaerr = cudaMemcpy(ptrH, ptrD, sizeof(type)*count, cudaMemcpyDeviceToHost); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCopyFromData %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}\
+	}
+
+#define hypre_DeviceMemset(ptr,value,type,count)	\
+	cudaMemset(ptr,value,count*sizeof(type));
+	
+#define hypre_UMTAlloc(type, count)				\
+  ({									\
+      type * ptr;								\
+      cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+      ptr;								\
+  })
+	
+#define hypre_UMCTAlloc(type, count)					\
+  ({									\
+    type * ptr;								\
+    cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+    cudaMemset(ptr,0,sizeof(type)*(count));				\
+    ptr;})								\
+  
+  
+#define hypre_UMTReAlloc(type, count)\
+  ({							 \
+    type *newptr;							\
+    /* note: only allocates; the previous block is not freed here */	\
+    cudaMallocManaged((void**)&newptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+    newptr;})
+  
+#define hypre_UMTFree(ptr) \
+      cudaFree(ptr)
+
+#define hypre_InitMemoryDebug(id)
+#define hypre_FinalizeMemoryDebug()
+
+#define hypre_TAlloc(type, count) \
+( (type *)hypre_MAlloc((size_t)(sizeof(type) * (count))) )
+
+#define hypre_CTAlloc(type, count) \
+( (type *)hypre_CAlloc((size_t)(count), (size_t)sizeof(type)) )
+
+#define hypre_TReAlloc(ptr, type, count) \
+( (type *)hypre_ReAlloc((char *)ptr, (size_t)(sizeof(type) * (count))) )
+
+#define hypre_TFree(ptr) \
+( hypre_Free((char *)ptr), ptr = NULL )
+  
+  //#define hypre_TAlloc(type, count)  hypre_UMTAlloc(type, count)
+  //#define hypre_CTAlloc(type, count) hypre_UMCTAlloc(type, count)
+  //#define hypre_TReAlloc(ptr, type, count) hypre_UMTReAlloc(type, count)
+  //#define hypre_TFree(ptr) hypre_UMTFree(ptr)
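+  /* The four commented defines above would reroute the generic hypre_TAlloc
+     family to unified memory; as committed, they stay on the host allocator. */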
+
+#define hypre_SharedTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_SharedCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_SharedTReAlloc(ptr, type, count) hypre_TReAlloc(ptr, type, (count))
+#define hypre_SharedTFree(ptr) hypre_TFree(ptr)
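+
+/* Illustrative sketch (editor's example, not part of the build) of how the
+   statement-expression allocators above are used; 'n' and 'x' are hypothetical:
+
+     HYPRE_Int   n = 100;
+     HYPRE_Real *x = hypre_DeviceCTAlloc(HYPRE_Real, n);   allocated and zeroed
+     hypre_DeviceTFree(x);
+*/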
+#else
+#define HYPRE_CUDA_GLOBAL 
+
 /*--------------------------------------------------------------------------
  * Use "Debug Malloc Library", dmalloc
  *--------------------------------------------------------------------------*/
@@ -468,6 +617,34 @@ extern "C" {
 #define hypre_SharedTReAlloc(type, count) hypre_TReAlloc(type, (count))
 #define hypre_SharedTFree(ptr) hypre_TFree(ptr)
 
+#define hypre_DeviceTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_DeviceCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_DeviceTReAlloc(ptr, type, count) hypre_TReAlloc(ptr, type, (count))
+#define hypre_DeviceTFree(ptr) hypre_TFree(ptr)
+#define hypre_DataCopyToData(ptrH,ptrD,type,count) memcpy(ptrD, ptrH, sizeof(type)*(count))
+#define hypre_DataCopyFromData(ptrH,ptrD,type,count) memcpy(ptrH, ptrD, sizeof(type)*(count))
+#define hypre_DeviceMemset(ptr,value,type,count)	memset(ptr,value,count*sizeof(type))
+#define hypre_UMTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_UMCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_UMTReAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_UMTFree(ptr) hypre_TFree(ptr)
+#endif
+  
+#define hypre_PinnedTAlloc(type, count)\
+( (type *)hypre_MAllocPinned((size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostTAlloc(type, count) \
+( (type *)hypre_MAllocHost((size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostCTAlloc(type, count) \
+( (type *)hypre_CAllocHost((size_t)(count), (size_t)sizeof(type)) )
+
+#define hypre_HostTReAlloc(ptr, type, count) \
+( (type *)hypre_ReAllocHost((char *)ptr, (size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostTFree(ptr) \
+( hypre_FreeHost((char *)ptr), ptr = NULL )
+
 /*--------------------------------------------------------------------------
  * Prototypes
  *--------------------------------------------------------------------------*/
@@ -476,8 +653,13 @@ extern "C" {
 HYPRE_Int hypre_OutOfMemory ( size_t size );
 char *hypre_MAlloc ( size_t size );
 char *hypre_CAlloc ( size_t count , size_t elt_size );
+char *hypre_MAllocPinned( size_t size );
 char *hypre_ReAlloc ( char *ptr , size_t size );
 void hypre_Free ( char *ptr );
+char *hypre_CAllocHost( size_t count,size_t elt_size );
+char *hypre_MAllocHost( size_t size );
+char *hypre_ReAllocHost( char   *ptr,size_t  size );
+void hypre_FreeHost( char *ptr );
 char *hypre_SharedMAlloc ( size_t size );
 char *hypre_SharedCAlloc ( size_t count , size_t elt_size );
 char *hypre_SharedReAlloc ( char *ptr , size_t size );
@@ -577,11 +759,12 @@ HYPRE_Real time_get_cpu_seconds_( void );
 #ifndef HYPRE_TIMING
 
 #define hypre_InitializeTiming(name) 0
+#define hypre_FinalizeTiming(index)
 #define hypre_IncFLOPCount(inc)
 #define hypre_BeginTiming(i)
 #define hypre_EndTiming(i)
 #define hypre_PrintTiming(heading, comm)
-#define hypre_FinalizeTiming(index)
+#define hypre_ClearTiming()
 
 /*--------------------------------------------------------------------------
  * With timing on
@@ -747,11 +930,9 @@ typedef struct
    
 } hypre_DataExchangeResponse;
 
-
 HYPRE_Int hypre_CreateBinaryTree(HYPRE_Int, HYPRE_Int, hypre_BinaryTree*);
 HYPRE_Int hypre_DestroyBinaryTree(hypre_BinaryTree*);
 
-
 HYPRE_Int hypre_DataExchangeList(HYPRE_Int num_contacts, 
 		     HYPRE_Int *contact_proc_list, void *contact_send_buf, 
 		     HYPRE_Int *contact_send_buf_starts, HYPRE_Int contact_obj_size, 
@@ -760,7 +941,6 @@ HYPRE_Int hypre_DataExchangeList(HYPRE_Int num_contacts,
                      HYPRE_Int rnum, MPI_Comm comm,  void **p_response_recv_buf, 
                      HYPRE_Int **p_response_recv_buf_starts);
 
-
 #endif /* end of header */
 
 /*BHEADER**********************************************************************
@@ -840,9 +1020,365 @@ void hypre_error_handler(const char *filename, HYPRE_Int line, HYPRE_Int ierr, c
 
 #endif /* CALIPER_INSTRUMENTATION_HEADER */
 
-/*--------------------------------------------------------------------------
- * Other prototypes
- *--------------------------------------------------------------------------*/
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+//#define CUDAMEMATTACHTYPE cudaMemAttachGlobal
+//#define CUDAMEMATTACHTYPE cudaMemAttachHost
+#define HYPRE_GPU_USE_PINNED 1
+#define HYPRE_USE_MANAGED_SCALABLE 1
+#endif
+
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#ifdef USE_NVTX
+#include "nvToolsExt.h"
+#include "nvToolsExtCudaRt.h"
+
+static const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
+static const int num_colors = sizeof(colors)/sizeof(uint32_t);
+
+#define PUSH_RANGE(name,cid) { \
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib);	\
+}
+
+#define PUSH_RANGE_PAYLOAD(name,cid,load) {		\
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    eventAttrib.payloadType = NVTX_PAYLOAD_TYPE_INT64; \
+    eventAttrib.payload.llValue = load; \
+    eventAttrib.category=1; \
+    nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib); \
+}
+
+#define PUSH_RANGE_DOMAIN(name,cid,dId) {				\
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    nvtxDomainRangePushEx(getdomain(dId),&eventAttrib);	\
+}
+
+#define POP_RANGE nvtxDomainRangePop(HYPRE_DOMAIN);
+#define POP_RANGE_DOMAIN(dId) {			\
+  nvtxDomainRangePop(getdomain(dId));		\
+  }
+#else
+#define PUSH_RANGE(name,cid)
+#define POP_RANGE
+#define PUSH_RANGE_PAYLOAD(name,cid,load)
+#define PUSH_RANGE_DOMAIN(name,cid,domainName)
+#endif
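+
+/* Illustrative sketch (editor's example): in USE_NVTX builds a profiled phase
+   is bracketed with a range so it shows up in the profiler timeline; the
+   label "setup" is hypothetical:
+
+     PUSH_RANGE("setup", 0);
+     ... code being profiled ...
+     POP_RANGE;
+*/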
+
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#ifdef HYPRE_USE_MANAGED
+#include <cuda_runtime_api.h>
+#define CUDAMEMATTACHTYPE cudaMemAttachGlobal
+#define MEM_PAD_LEN 1
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line)
+{
+   if (code != cudaSuccess) 
+   {
+     fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code));
+     exit(2);
+   }
+}
+#define HYPRE_HOST_POINTER 0
+#define HYPRE_MANAGED_POINTER 1
+#define HYPRE_PINNED_POINTER 2
+#define HYPRE_DEVICE_POINTER 3
+#define HYPRE_UNDEFINED_POINTER1 4
+#define HYPRE_UNDEFINED_POINTER2 5
+void cudaSafeFree(void *ptr,int padding);
+hypre_int PrintPointerAttributes(const void *ptr);
+hypre_int PointerAttributes(const void *ptr);
+#endif
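+
+/* Illustrative sketch (editor's example): CUDA runtime calls are wrapped in
+   gpuErrchk so failures abort with file/line information; 'buf' and 'nbytes'
+   are hypothetical:
+
+     void *buf;
+     gpuErrchk(cudaMallocManaged(&buf, nbytes, cudaMemAttachGlobal));
+     gpuErrchk(cudaFree(buf));
+*/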
+
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+#ifndef __cusparseErrorCheck__
+#define __cusparseErrorCheck__
+#include <cusparse.h>
+#include <cublas_v2.h>
+#include <stdio.h>
+//#include <cuda_runtime_api.h>
+#include <stdlib.h>
+inline const char *cusparseErrorCheck(cusparseStatus_t error)
+{
+    switch (error)
+    {
+        case CUSPARSE_STATUS_SUCCESS:
+            return "CUSPARSE_STATUS_SUCCESS";
+
+        case CUSPARSE_STATUS_NOT_INITIALIZED:
+            return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+        case CUSPARSE_STATUS_ALLOC_FAILED:
+            return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+        case CUSPARSE_STATUS_INVALID_VALUE:
+            return "CUSPARSE_STATUS_INVALID_VALUE";
+
+        case CUSPARSE_STATUS_ARCH_MISMATCH:
+            return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+        case CUSPARSE_STATUS_MAPPING_ERROR:
+            return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+        case CUSPARSE_STATUS_EXECUTION_FAILED:
+            return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+        case CUSPARSE_STATUS_INTERNAL_ERROR:
+            return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+        default:
+	    return "Unknown error in cusparseErrorCheck";
+    }
+    
+}
+inline const char *cublasErrorCheck(cublasStatus_t error)
+{
+    switch (error)
+    {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+        case CUBLAS_STATUS_LICENSE_ERROR:
+	    return "CUBLAS_STATUS_LICENSE_ERROR";
+        default:
+	    return "Unknown error in cublasErrorCheck";
+    }
+
+}
+//#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+//inline void gpuAssert(cudaError_t code, const char *file, int line)
+//{
+//   if (code != cudaSuccess) 
+//   {
+//     fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file);
+//     fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code));
+//     exit(2);
+//   }
+//}
+#define cusparseErrchk(ans) { cusparseAssert((ans), __FILE__, __LINE__); }
+inline void cusparseAssert(cusparseStatus_t code, const char *file, int line)
+{
+   if (code != CUSPARSE_STATUS_SUCCESS) 
+   {
+     fprintf(stderr,"CUSPARSE ERROR  ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUSPARSE ERROR : %s \n", cusparseErrorCheck(code));
+   }
+}
+#define cublasErrchk(ans){ cublasAssert((ans), __FILE__, __LINE__); }
+inline void cublasAssert(cublasStatus_t code, const char *file, int line)
+{
+   if (code != CUBLAS_STATUS_SUCCESS) 
+   {
+     fprintf(stderr,"CUBLAS ERROR  ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUBLAS ERROR : %s \n", cublasErrorCheck(code));
+   }
+}
+//int PointerType(const void *ptr);
+void cudaSafeFree(void *ptr,int padding);
+//void PrintPointerAttributes(const void *ptr);
+//size_t mempush(void* ptr, size_t size,int purge);
+//int memloc(void *ptr, int device);
+#endif
+#endif
+
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+#ifndef __GPUMEM_H__
+#define  __GPUMEM_H__
+#ifdef HYPRE_USE_GPU
+#include <cuda_runtime_api.h>
+void hypre_GPUInit(hypre_int use_device);
+void hypre_GPUFinalize();
+int VecScaleScalar(double *u, const double alpha,  int num_rows,cudaStream_t s);
+void VecCopy(double* tgt, const double* src, int size,cudaStream_t s);
+void VecSet(double* tgt, int size, double value, cudaStream_t s);
+void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void CudaCompileFlagCheck();
+#endif
+
+cudaStream_t getstreamOlde(hypre_int i);
+nvtxDomainHandle_t getdomain(hypre_int i);
+cudaEvent_t getevent(hypre_int i);
+void MemAdviseReadOnly(const void *ptr, hypre_int device);
+void MemAdviseUnSetReadOnly(const void *ptr, hypre_int device);
+void MemAdviseSetPrefLocDevice(const void *ptr, hypre_int device);
+void MemAdviseSetPrefLocHost(const void *ptr);
+void MemPrefetch(const void *ptr,hypre_int device,cudaStream_t stream);
+void MemPrefetchSized(const void *ptr,size_t size,hypre_int device,cudaStream_t stream);
+void MemPrefetchForce(const void *ptr,hypre_int device,cudaStream_t stream);
+cublasHandle_t getCublasHandle();
+cusparseHandle_t getCusparseHandle();
+typedef struct node {
+  const void *ptr;
+  size_t size;
+  struct node *next;
+} node;
+size_t mempush(const void *ptr, size_t size, hypre_int action);
+node *memfind(node *head, const void *ptr);
+void memdel(node **head, node *found);
+void meminsert(node **head, const void *ptr,size_t size);
+void printlist(node *head,hypre_int nc);
+//#define MEM_PAD_LEN 1
+size_t memsize(const void *ptr);
+hypre_int getsetasyncmode(hypre_int mode, hypre_int action);
+void SetAsyncMode(hypre_int mode);
+hypre_int GetAsyncMode();
+void branchStream(hypre_int i, hypre_int j);
+void joinStreams(hypre_int i, hypre_int j, hypre_int k);
+void affs(hypre_int myid);
+hypre_int getcore();
+hypre_int getnuma();
+hypre_int checkDeviceProps();
+hypre_int pointerIsManaged(const void *ptr);
+/*
+ * Global struct for keeping HYPRE GPU Init state
+ */
+
+#define MAX_HGS_ELEMENTS 10
+struct hypre__global_struct{
+  hypre_int initd;
+  hypre_int device;
+  hypre_int device_count;
+  cublasHandle_t cublas_handle;
+  cusparseHandle_t cusparse_handle;
+  cusparseMatDescr_t cusparse_mat_descr;
+  cudaStream_t streams[MAX_HGS_ELEMENTS];
+  nvtxDomainHandle_t nvtx_domain;
+  hypre_int concurrent_managed_access;
+  size_t memoryHWM;
+};
+
+extern struct hypre__global_struct hypre__global_handle ;
+
+/*
+ * Macros for accessing elements of the global handle
+ */
+#define HYPRE_GPU_HANDLE hypre__global_handle.initd
+#define HYPRE_CUBLAS_HANDLE hypre__global_handle.cublas_handle
+#define HYPRE_CUSPARSE_HANDLE hypre__global_handle.cusparse_handle
+#define HYPRE_DEVICE hypre__global_handle.device
+#define HYPRE_DEVICE_COUNT hypre__global_handle.device_count
+#define HYPRE_CUSPARSE_MAT_DESCR hypre__global_handle.cusparse_mat_descr
+#define HYPRE_STREAM(index) (hypre__global_handle.streams[index])
+#define HYPRE_DOMAIN  hypre__global_handle.nvtx_domain
+#define HYPRE_GPU_CMA hypre__global_handle.concurrent_managed_access
+#define HYPRE_GPU_HWM hypre__global_handle.memoryHWM
+
+#endif
+
+#else
+
+#define hypre_GPUInit(use_device)
+#define hypre_GPUFinalize()
+
+#endif
+
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
 
 /* amg_linklist.c */
 void hypre_dispose_elt ( hypre_LinkList element_ptr );
@@ -869,21 +1405,21 @@ HYPRE_Real    hypre_cimag( HYPRE_Complex value );
 #endif
 
 /* hypre_printf.c */
-#ifdef HYPRE_BIGINT
+// #ifdef HYPRE_BIGINT
 HYPRE_Int hypre_printf( const char *format , ... );
 HYPRE_Int hypre_fprintf( FILE *stream , const char *format, ... );
 HYPRE_Int hypre_sprintf( char *s , const char *format, ... );
 HYPRE_Int hypre_scanf( const char *format , ... );
 HYPRE_Int hypre_fscanf( FILE *stream , const char *format, ... );
 HYPRE_Int hypre_sscanf( char *s , const char *format, ... );
-#else
-#define hypre_printf  printf
-#define hypre_fprintf fprintf
-#define hypre_sprintf sprintf
-#define hypre_scanf   scanf
-#define hypre_fscanf  fscanf
-#define hypre_sscanf  sscanf
-#endif
+// #else
+// #define hypre_printf  printf
+// #define hypre_fprintf fprintf
+// #define hypre_sprintf sprintf
+// #define hypre_scanf   scanf
+// #define hypre_fscanf  fscanf
+// #define hypre_sscanf  sscanf
+// #endif
 
 /* hypre_qsort.c */
 void hypre_swap ( HYPRE_Int *v , HYPRE_Int i , HYPRE_Int j );
@@ -906,8 +1442,9 @@ void hypre_qsort_abs ( HYPRE_Real *w , HYPRE_Int left , HYPRE_Int right );
 HYPRE_Int hypre_DoubleQuickSplit ( HYPRE_Real *values , HYPRE_Int *indices , HYPRE_Int list_length , HYPRE_Int NumberKept );
 
 /* random.c */
-void hypre_SeedRand ( HYPRE_Int seed );
-HYPRE_Real hypre_Rand ( void );
+HYPRE_CUDA_GLOBAL void hypre_SeedRand ( HYPRE_Int seed );
+HYPRE_CUDA_GLOBAL HYPRE_Int hypre_RandI ( void );
+HYPRE_CUDA_GLOBAL HYPRE_Real hypre_Rand ( void );
 
 /* hypre_prefix_sum.c */
 /**
@@ -1057,7 +1594,5 @@ void hypre_sort_and_create_inverse_map(
 }
 #endif
 
-/*#include "hypre_hopscotch_hash.h"*/
-
 #endif
 
diff --git a/src/utilities/amg_linklist.h b/src/utilities/amg_linklist.h
index 53911fe..c326635 100644
--- a/src/utilities/amg_linklist.h
+++ b/src/utilities/amg_linklist.h
@@ -10,8 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
 /******************************************************************************
  *
  * Header file link lists
@@ -46,3 +44,4 @@ typedef hypre_ListElement  *hypre_LinkList;
 #endif
 
 #endif
+
diff --git a/src/utilities/caliper_instrumentation.h b/src/utilities/caliper_instrumentation.h
index 0f3c071..1624936 100644
--- a/src/utilities/caliper_instrumentation.h
+++ b/src/utilities/caliper_instrumentation.h
@@ -36,3 +36,4 @@
 #endif
 
 #endif /* CALIPER_INSTRUMENTATION_HEADER */
+
diff --git a/src/utilities/exchange_data.h b/src/utilities/exchange_data.h
index 918899f..82d535e 100644
--- a/src/utilities/exchange_data.h
+++ b/src/utilities/exchange_data.h
@@ -10,7 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
 #ifndef hypre_EXCHANGE_DATA_HEADER
 #define hypre_EXCHANGE_DATA_HEADER
 
@@ -19,7 +18,6 @@
 #define hypre_BinaryTreeChildIds(tree)      (tree->child_id)
 #define hypre_BinaryTreeChildId(tree, i)    (tree->child_id[i])
 
-
 typedef struct
 {
    HYPRE_Int                   parent_id;
@@ -27,8 +25,6 @@ typedef struct
    HYPRE_Int		        *child_id;
 } hypre_BinaryTree;
 
-
-
 /* In the fill_response() function the user needs to set the recv__buf
    and the response_message_size.  Memory of size send_response_storage has been
    alllocated for the send_buf (in exchange_data) - if more is needed, then
@@ -37,7 +33,6 @@ typedef struct
    If the response is an empty "confirmation" message, then set
    response_message_size =0 (and do not modify the send_buf) */
 
-
 typedef struct
 {
    HYPRE_Int    (*fill_response)(void* recv_buf, HYPRE_Int contact_size, 
@@ -51,11 +46,9 @@ typedef struct
    
 } hypre_DataExchangeResponse;
 
-
 HYPRE_Int hypre_CreateBinaryTree(HYPRE_Int, HYPRE_Int, hypre_BinaryTree*);
 HYPRE_Int hypre_DestroyBinaryTree(hypre_BinaryTree*);
 
-
 HYPRE_Int hypre_DataExchangeList(HYPRE_Int num_contacts, 
 		     HYPRE_Int *contact_proc_list, void *contact_send_buf, 
 		     HYPRE_Int *contact_send_buf_starts, HYPRE_Int contact_obj_size, 
@@ -64,5 +57,5 @@ HYPRE_Int hypre_DataExchangeList(HYPRE_Int num_contacts,
                      HYPRE_Int rnum, MPI_Comm comm,  void **p_response_recv_buf, 
                      HYPRE_Int **p_response_recv_buf_starts);
 
-
 #endif /* end of header */
+
diff --git a/src/utilities/general.h b/src/utilities/general.h
index 328a6c7..72455ea 100644
--- a/src/utilities/general.h
+++ b/src/utilities/general.h
@@ -10,7 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
 /******************************************************************************
  *
  * General structures and values
@@ -20,6 +19,15 @@
 #ifndef hypre_GENERAL_HEADER
 #define hypre_GENERAL_HEADER
 
+/* This allows us to consistently avoid 'int' throughout hypre */
+typedef int               hypre_int;
+typedef long int          hypre_longint;
+typedef unsigned int      hypre_uint;
+typedef unsigned long int hypre_ulongint;
+
+/* This allows us to consistently avoid 'double' throughout hypre */
+typedef double            hypre_double;
+
 /*--------------------------------------------------------------------------
  * Define various functions
  *--------------------------------------------------------------------------*/
@@ -44,3 +52,4 @@
 #endif
 
 #endif
+
diff --git a/src/utilities/threading.h b/src/utilities/gpgpu.h
similarity index 59%
copy from src/utilities/threading.h
copy to src/utilities/gpgpu.h
index ab9e478..2559bdb 100644
--- a/src/utilities/threading.h
+++ b/src/utilities/gpgpu.h
@@ -10,23 +10,10 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-#ifndef hypre_THREADING_HEADER
-#define hypre_THREADING_HEADER
-
-#ifdef HYPRE_USING_OPENMP
-
-HYPRE_Int hypre_NumThreads( void );
-HYPRE_Int hypre_NumActiveThreads( void );
-HYPRE_Int hypre_GetThreadNum( void );
-
-#else
-
-#define hypre_NumThreads() 1
-#define hypre_NumActiveThreads() 1
-#define hypre_GetThreadNum() 0
-
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+//#define CUDAMEMATTACHTYPE cudaMemAttachGlobal
+//#define CUDAMEMATTACHTYPE cudaMemAttachHost
+#define HYPRE_GPU_USE_PINNED 1
+#define HYPRE_USE_MANAGED_SCALABLE 1
 #endif
 
-void hypre_GetSimpleThreadPartition( HYPRE_Int *begin, HYPRE_Int *end, HYPRE_Int n );
-
-#endif
diff --git a/src/utilities/gpuErrorCheck.c b/src/utilities/gpuErrorCheck.c
new file mode 100644
index 0000000..3fff6cf
--- /dev/null
+++ b/src/utilities/gpuErrorCheck.c
@@ -0,0 +1,111 @@
+
+#include "_hypre_utilities.h"
+
+#if defined(HYPRE_USE_GPU) || defined(HYPRE_USE_MANAGED)
+#include <signal.h>
+#ifdef HYPRE_USE_GPU
+extern const char *cusparseErrorCheck(cusparseStatus_t error);
+extern void gpuAssert(cudaError_t code, const char *file, int line);
+extern void cusparseAssert(cusparseStatus_t code, const char *file, int line);
+#endif
+
+/*
+  cudaSafeFree frees managed memory allocated by hypre_MAlloc, hypre_CAlloc and hypre_ReAlloc.
+  It checks that the memory is managed before freeing it and emits a warning when the pointer
+  was not allocated by those routines. Defining ABORT_ON_RAW_POINTER turns the warning into an
+  abort, and the resulting core file can be used to find the location of the anomalous hypre_Free.
+ */
+void cudaSafeFree(void *ptr,int padding)
+{
+  PUSH_RANGE("SAFE_FREE",3);
+  struct cudaPointerAttributes ptr_att;
+  size_t *sptr=(size_t*)ptr-padding;
+  cudaError_t err;
+
+  err=cudaPointerGetAttributes(&ptr_att,ptr);
+  if (err!=cudaSuccess){
+    cudaGetLastError(); 
+#define FULL_WARN
+#ifndef ABORT_ON_RAW_POINTER
+#ifdef FULL_WARN
+    if (err==cudaErrorInvalidValue) fprintf(stderr,"WARNING :: Raw pointer passed to cudaSafeFree %p\n",ptr);
+    if (err==cudaErrorInvalidDevice) fprintf(stderr,"WARNING :: cudaSafeFree :: INVALID DEVICE on ptr = %p\n",ptr);
+    //PrintPointerAttributes(ptr);
+#endif
+#else
+    fprintf(stderr,"ERROR:: cudaSafeFree Aborting on raw unmanaged pointer %p\n",ptr);
+    raise(SIGABRT);
+#endif
+    free(ptr); /* Free the nonManaged pointer */
+    return;
+  }
+  if (ptr_att.isManaged){
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_MEASURE_GPU_HWM)
+    size_t mfree,mtotal;
+    gpuErrchk(cudaMemGetInfo(&mfree,&mtotal));
+    HYPRE_GPU_HWM=hypre_max((mtotal-mfree),HYPRE_GPU_HWM);
+#endif
+    /* Code below handles managed memory pointers not allocated using hypre_CTAlloc or hypre_TAlloc */
+    if (PointerAttributes(ptr)!=PointerAttributes(sptr)){
+      //fprintf(stderr,"ERROR IN Pointer for freeing %p %p\n",ptr,sptr);
+      gpuErrchk(cudaFree(ptr)); 
+      return;
+    }
+    gpuErrchk(cudaFree(sptr)); 
+  } else {
+    /* It is a pinned memory pointer */
+    //printf("ERROR:: NON-managed pointer passed to cudaSafeFree\n");
+    if (ptr_att.memoryType==cudaMemoryTypeHost){
+      gpuErrchk(cudaFreeHost(sptr));
+    } else if (ptr_att.memoryType==cudaMemoryTypeDevice){
+      gpuErrchk(cudaFree(sptr)); 
+    }
+  }
+  POP_RANGE;
+  return;
+}
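+
+/* Illustrative sketch (editor's example): for blocks carrying the size_t
+   header words, 'padding' is the number of such words in front of the user
+   pointer, so a typical call is
+
+     cudaSafeFree(ptr, MEM_PAD_LEN);
+*/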
+hypre_int PrintPointerAttributes(const void *ptr){
+  struct cudaPointerAttributes ptr_att;
+  if (cudaPointerGetAttributes(&ptr_att,ptr)!=cudaSuccess){
+    cudaGetLastError(); 
+    fprintf(stderr,"PrintPointerAttributes:: Raw pointer %p\n",ptr);
+    return HYPRE_HOST_POINTER;
+  }
+  if (ptr_att.isManaged){
+    fprintf(stderr,"PrintPointerAttributes:: Managed pointer\n");
+    fprintf(stderr,"Host address = %p, Device Address = %p\n",ptr_att.hostPointer, ptr_att.devicePointer);
+    if (ptr_att.memoryType==cudaMemoryTypeHost) fprintf(stderr,"Memory is located on host\n");
+    if (ptr_att.memoryType==cudaMemoryTypeDevice) fprintf(stderr,"Memory is located on device\n");
+    fprintf(stderr,"Device associated with this pointer is %d\n",ptr_att.device);
+    return HYPRE_MANAGED_POINTER;
+  } else {
+    fprintf(stderr,"PrintPointerAttributes:: Non-Managed & non-raw pointer\n Probably pinned host pointer\n");
+    if (ptr_att.memoryType==cudaMemoryTypeHost) {
+      fprintf(stderr,"Memory is located on host\n");
+      return HYPRE_PINNED_POINTER;
+    }
+    if (ptr_att.memoryType==cudaMemoryTypeDevice) {
+      fprintf(stderr,"Memory is located on device\n");
+      return HYPRE_DEVICE_POINTER ;
+    }
+    return HYPRE_UNDEFINED_POINTER1;
+  }
+  return HYPRE_UNDEFINED_POINTER2;
+}
+hypre_int PointerAttributes(const void *ptr){
+  struct cudaPointerAttributes ptr_att;
+  if (cudaPointerGetAttributes(&ptr_att,ptr)!=cudaSuccess){
+     cudaGetLastError(); 
+     return HYPRE_HOST_POINTER;
+  }
+  if (ptr_att.isManaged){
+    return HYPRE_MANAGED_POINTER; 
+  } else {
+    if (ptr_att.memoryType==cudaMemoryTypeHost) return HYPRE_PINNED_POINTER; /* Host pointer from cudaMallocHost */
+    if (ptr_att.memoryType==cudaMemoryTypeDevice) return HYPRE_DEVICE_POINTER ; /* CUDA device pointer */
+    return HYPRE_UNDEFINED_POINTER1; /* Shouldn't happen */
+  }
+  return HYPRE_UNDEFINED_POINTER2; /* Shouldn't happen */
+}
+
+#endif
diff --git a/src/utilities/gpuErrorCheck.h b/src/utilities/gpuErrorCheck.h
new file mode 100644
index 0000000..a721ad8
--- /dev/null
+++ b/src/utilities/gpuErrorCheck.h
@@ -0,0 +1,153 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#ifdef HYPRE_USE_MANAGED
+#include <cuda_runtime_api.h>
+#define CUDAMEMATTACHTYPE cudaMemAttachGlobal
+#define MEM_PAD_LEN 1
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line)
+{
+   if (code != cudaSuccess) 
+   {
+     fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code));
+     exit(2);
+   }
+}
+#define HYPRE_HOST_POINTER 0
+#define HYPRE_MANAGED_POINTER 1
+#define HYPRE_PINNED_POINTER 2
+#define HYPRE_DEVICE_POINTER 3
+#define HYPRE_UNDEFINED_POINTER1 4
+#define HYPRE_UNDEFINED_POINTER2 5
+void cudaSafeFree(void *ptr,int padding);
+hypre_int PrintPointerAttributes(const void *ptr);
+hypre_int PointerAttributes(const void *ptr);
+#endif
+
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+#ifndef __cusparseErrorCheck__
+#define __cusparseErrorCheck__
+#include <cusparse.h>
+#include <cublas_v2.h>
+#include <stdio.h>
+//#include <cuda_runtime_api.h>
+#include <stdlib.h>
+inline const char *cusparseErrorCheck(cusparseStatus_t error)
+{
+    switch (error)
+    {
+        case CUSPARSE_STATUS_SUCCESS:
+            return "CUSPARSE_STATUS_SUCCESS";
+
+        case CUSPARSE_STATUS_NOT_INITIALIZED:
+            return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+        case CUSPARSE_STATUS_ALLOC_FAILED:
+            return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+        case CUSPARSE_STATUS_INVALID_VALUE:
+            return "CUSPARSE_STATUS_INVALID_VALUE";
+
+        case CUSPARSE_STATUS_ARCH_MISMATCH:
+            return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+        case CUSPARSE_STATUS_MAPPING_ERROR:
+            return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+        case CUSPARSE_STATUS_EXECUTION_FAILED:
+            return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+        case CUSPARSE_STATUS_INTERNAL_ERROR:
+            return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+        default:
+	    return "Unknown error in cusparseErrorCheck";
+    }
+    
+}
+inline const char *cublasErrorCheck(cublasStatus_t error)
+{
+    switch (error)
+    {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+        case CUBLAS_STATUS_LICENSE_ERROR:
+	    return "CUBLAS_STATUS_LICENSE_ERROR";
+        default:
+	    return "Unknown error in cublasErrorCheck";
+    }
+
+}
+//#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+//inline void gpuAssert(cudaError_t code, const char *file, int line)
+//{
+//   if (code != cudaSuccess) 
+//   {
+//     fprintf(stderr,"CUDA ERROR ( Code = %d) in line %d of file %s\n",code,line,file);
+//     fprintf(stderr,"CUDA ERROR : %s \n", cudaGetErrorString(code));
+//     exit(2);
+//   }
+//}
+#define cusparseErrchk(ans) { cusparseAssert((ans), __FILE__, __LINE__); }
+inline void cusparseAssert(cusparseStatus_t code, const char *file, int line)
+{
+   if (code != CUSPARSE_STATUS_SUCCESS) 
+   {
+     fprintf(stderr,"CUSPARSE ERROR  ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUSPARSE ERROR : %s \n", cusparseErrorCheck(code));
+   }
+}
+#define cublasErrchk(ans){ cublasAssert((ans), __FILE__, __LINE__); }
+inline void cublasAssert(cublasStatus_t code, const char *file, int line)
+{
+   if (code != CUBLAS_STATUS_SUCCESS) 
+   {
+     fprintf(stderr,"CUBLAS ERROR  ( Code = %d) IN CUDA CALL line %d of file %s\n",code,line,file);
+     fprintf(stderr,"CUBLAS ERROR : %s \n", cublasErrorCheck(code));
+   }
+}
+//int PointerType(const void *ptr);
+void cudaSafeFree(void *ptr,int padding);
+//void PrintPointerAttributes(const void *ptr);
+//size_t mempush(void* ptr, size_t size,int purge);
+//int memloc(void *ptr, int device);
+#endif
+#endif
+
diff --git a/src/utilities/gpuMem.c b/src/utilities/gpuMem.c
new file mode 100644
index 0000000..1033f48
--- /dev/null
+++ b/src/utilities/gpuMem.c
@@ -0,0 +1,513 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "_hypre_utilities.h"
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <sched.h>
+#include <errno.h>
+hypre_int ggc(hypre_int id);
+
+/* Global struct that holds the device number, library handles, streams, etc. */
+struct hypre__global_struct hypre__global_handle = { .initd=0, .device=0, .device_count=1,.memoryHWM=0};
+
+
+/* Initialize the GPU branch of hypre AMG. The application passes the device
+   number it is using, or -1 to let hypre decide which device to use. */
+void hypre_GPUInit(hypre_int use_device){
+  char pciBusId[80];
+  hypre_int myid;
+  hypre_int nDevices;
+  hypre_int device;
+  if (!HYPRE_GPU_HANDLE){
+    HYPRE_GPU_HANDLE=1;
+    HYPRE_DEVICE=0;
+    gpuErrchk(cudaGetDeviceCount(&nDevices));
+    HYPRE_DEVICE_COUNT=nDevices;
+    
+    if (use_device<0){
+      if (nDevices==1){
+	/* with mpibind each process will only see 1 GPU */
+	HYPRE_DEVICE=0;
+	gpuErrchk(cudaSetDevice(HYPRE_DEVICE));
+	cudaDeviceGetPCIBusId ( pciBusId, 80, HYPRE_DEVICE);
+      } else if (nDevices>1) {
+	/* No mpibind or it is a single rank run */
+	hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid );
+	//affs(myid);
+	MPI_Comm node_comm;
+	MPI_Info info;
+	MPI_Info_create(&info);
+	MPI_Comm_split_type(hypre_MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, myid, info, &node_comm);
+	hypre_int round_robin=1;
+	hypre_int myNodeid, NodeSize;
+	MPI_Comm_rank(node_comm, &myNodeid);
+	MPI_Comm_size(node_comm, &NodeSize);
+	if (round_robin){
+	  /* Round robin allocation of GPUs. Does not account for affinities */
+	  HYPRE_DEVICE=myNodeid%nDevices; 
+	  gpuErrchk(cudaSetDevice(HYPRE_DEVICE));
+	  cudaDeviceGetPCIBusId ( pciBusId, 80, HYPRE_DEVICE);
+	  hypre_printf("WARNING:: Code running without mpibind\n");
+	  hypre_printf("Global ID = %d , Node ID %d running on device %d of %d \n",myid,myNodeid,HYPRE_DEVICE,nDevices);
+	} else {
+	  /* Try to set the GPU based on process binding */
+	  /* works correctly in all cases */
+	  MPI_Comm numa_comm;
+	  MPI_Comm_split(node_comm,getnuma(),myNodeid,&numa_comm);
+	  hypre_int myNumaId,NumaSize;
+	  MPI_Comm_rank(numa_comm, &myNumaId);
+	  MPI_Comm_size(numa_comm, &NumaSize);
+	  hypre_int domain_devices=nDevices/2; /* Again hardwired for 2 NUMA domains */
+	  HYPRE_DEVICE = getnuma()*2+myNumaId%domain_devices;
+	  gpuErrchk(cudaSetDevice(HYPRE_DEVICE));
+	  hypre_printf("WARNING:: Code running without mpibind\n");
+	  hypre_printf("NUMA %d GID %d , NodeID %d NumaID %d running on device %d (RR=%d) of %d \n",getnuma(),myid,myNodeid,myNumaId,HYPRE_DEVICE,myNodeid%nDevices,nDevices);
+	  
+	}
+	
+	MPI_Info_free(&info);
+      } else {
+	/* No device found  */
+	hypre_fprintf(stderr,"ERROR:: NO GPUS found \n");
+	exit(2);
+      }
+    } else {
+      HYPRE_DEVICE = use_device;
+      gpuErrchk(cudaSetDevice(HYPRE_DEVICE));
+    }
+      
+      /* Create NVTX domain for all the nvtx calls in HYPRE */
+      HYPRE_DOMAIN=nvtxDomainCreateA("Hypre");
+      
+      /* Initialize streams */
+      hypre_int jj;
+      for(jj=0;jj<MAX_HGS_ELEMENTS;jj++)
+	gpuErrchk(cudaStreamCreateWithFlags(&(HYPRE_STREAM(jj)),cudaStreamNonBlocking));
+      
+      /* Initialize the library handles and streams */
+      
+    cusparseErrchk(cusparseCreate(&(HYPRE_CUSPARSE_HANDLE)));
+    cusparseErrchk(cusparseSetStream(HYPRE_CUSPARSE_HANDLE,HYPRE_STREAM(4)));
+    cusparseErrchk(cusparseCreateMatDescr(&(HYPRE_CUSPARSE_MAT_DESCR))); 
+    cusparseErrchk(cusparseSetMatType(HYPRE_CUSPARSE_MAT_DESCR,CUSPARSE_MATRIX_TYPE_GENERAL));
+    cusparseErrchk(cusparseSetMatIndexBase(HYPRE_CUSPARSE_MAT_DESCR,CUSPARSE_INDEX_BASE_ZERO));
+
+    cublasErrchk(cublasCreate(&(HYPRE_CUBLAS_HANDLE)));
+    cublasErrchk(cublasSetStream(HYPRE_CUBLAS_HANDLE,HYPRE_STREAM(4)));
+    if (!checkDeviceProps()) hypre_printf("WARNING:: Concurrent memory access not allowed\n");
+    /* Check if the arch flags used for compiling the cuda kernels match the device */
+    CudaCompileFlagCheck();
+  }
+}
+
+
+void hypre_GPUFinalize(){
+  
+  cusparseErrchk(cusparseDestroy(HYPRE_CUSPARSE_HANDLE));
+  
+  cublasErrchk(cublasDestroy(HYPRE_CUBLAS_HANDLE));
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_MEASURE_GPU_HWM)
+  hypre_printf("GPU Memory High Water Mark(per MPI_RANK) %f MB \n",(HYPRE_Real)HYPRE_GPU_HWM/1024/1024);
+#endif
+  /* Destroy streams */
+  hypre_int jj;
+  for(jj=0;jj<MAX_HGS_ELEMENTS;jj++)
+    gpuErrchk(cudaStreamDestroy(HYPRE_STREAM(jj)));
+  
+}
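+
+/* Illustrative sketch (editor's example) of the expected pairing; passing -1
+   lets hypre pick the device:
+
+     hypre_GPUInit(-1);
+     ... setup and solve ...
+     hypre_GPUFinalize();
+*/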
+
+void MemAdviseReadOnly(const void* ptr, hypre_int device){
+  if (ptr==NULL) return;
+  size_t size=mempush(ptr,0,0);
+  if (size==0) printf("WARNING:: Operations with 0 size vector \n");
+  gpuErrchk(cudaMemAdvise(ptr,size,cudaMemAdviseSetReadMostly,device));
+}
+
+void MemAdviseUnSetReadOnly(const void* ptr, hypre_int device){
+  if (ptr==NULL) return;
+  size_t size=mempush(ptr,0,0);
+  if (size==0) printf("WARNING:: Operations with 0 size vector \n");
+  gpuErrchk(cudaMemAdvise(ptr,size,cudaMemAdviseUnsetReadMostly,device));
+}
+
+
+void MemAdviseSetPrefLocDevice(const void *ptr, hypre_int device){
+  if (ptr==NULL) return;
+  gpuErrchk(cudaMemAdvise(ptr,mempush(ptr,0,0),cudaMemAdviseSetPreferredLocation,device));
+}
+
+void MemAdviseSetPrefLocHost(const void *ptr){
+  if (ptr==NULL) return;
+  gpuErrchk(cudaMemAdvise(ptr,mempush(ptr,0,0),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId));
+}
+
+
+void MemPrefetch(const void *ptr,hypre_int device,cudaStream_t stream){
+  if (ptr==NULL) return;
+  size_t size;
+  size=memsize(ptr);
+  PUSH_RANGE("MemPrefetch",4);
+  /* Do a prefetch every time until a possible UM bug is fixed */
+  if (size>0){
+    PrintPointerAttributes(ptr);
+    gpuErrchk(cudaMemPrefetchAsync(ptr,size,device,stream));
+    gpuErrchk(cudaStreamSynchronize(stream));
+  }
+  /* pop the range on every path so the NVTX push/pop stays balanced */
+  POP_RANGE;
+  return;
+}
+
+
+void MemPrefetchForce(const void *ptr,hypre_int device,cudaStream_t stream){
+  if (ptr==NULL) return;
+  size_t size=memsize(ptr);
+  PUSH_RANGE_PAYLOAD("MemPreFetchForce",4,size);
+  gpuErrchk(cudaMemPrefetchAsync(ptr,size,device,stream));
+  POP_RANGE;
+  return;
+}
+
+void MemPrefetchSized(const void *ptr,size_t size,hypre_int device,cudaStream_t stream){
+  if (ptr==NULL) return;
+  PUSH_RANGE_DOMAIN("MemPreFetchSized",4,0);
+  /* Do a prefetch every time until a possible UM bug is fixed */
+  if (size>0){
+    gpuErrchk(cudaMemPrefetchAsync(ptr,size,device,stream));
+  }
+  /* pop the range on every path so the NVTX push/pop stays balanced */
+  POP_RANGE_DOMAIN(0);
+  return;
+}
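+
+/* Illustrative sketch (editor's example): migrate a managed array to the
+   device ahead of a kernel launch; 'v' and 'n' are hypothetical:
+
+     MemPrefetchSized(v, n*sizeof(HYPRE_Real), HYPRE_DEVICE, HYPRE_STREAM(4));
+*/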
+
+
+/* Returns the same cublas handle with every call */
+cublasHandle_t getCublasHandle(){
+  cublasStatus_t stat;
+  static cublasHandle_t handle;
+  static hypre_int firstcall=1;
+  if (firstcall){
+    firstcall=0;
+    stat = cublasCreate(&handle);
+    if (stat!=CUBLAS_STATUS_SUCCESS) {
+      printf("ERROR:: CUBLAS Library initialization failed\n");
+      handle=0;
+      exit(2);
+    }
+    cublasErrchk(cublasSetStream(handle,HYPRE_STREAM(4)));
+  } else return handle;
+  return handle;
+}
+
+/* Returns the same cusparse handle with every call */
+cusparseHandle_t getCusparseHandle(){
+  cusparseStatus_t status;
+  static cusparseHandle_t handle;
+  static hypre_int firstcall=1;
+  if (firstcall){
+    firstcall=0;
+    status= cusparseCreate(&handle);
+    if (status != CUSPARSE_STATUS_SUCCESS) {
+      printf("ERROR:: CUSPARSE Library initialization failed\n");
+      handle=0;
+      exit(2);
+    }
+    cusparseErrchk(cusparseSetStream(handle,HYPRE_STREAM(4)));
+  } else return handle;
+  return handle;
+}
+
+/* C version of mempush using linked lists */
+
+size_t mempush(const void *ptr, size_t size, hypre_int action){
+  static node* head=NULL;
+  static hypre_int nc=0;
+  node *found=NULL;
+  if (!head){
+    if ((size<=0)||(action==1)) {
+      fprintf(stderr,"mempush can start only with an insertion or a size call \n");
+      return 0;
+    }
+    head = (node*)malloc(sizeof(node));
+    head->ptr=ptr;
+    head->size=size;
+    head->next=NULL;
+    nc++;
+    return size;
+  } else {
+    // Purge an address
+    if (action==1){
+      found=memfind(head,ptr);
+      if (found){
+	memdel(&head, found);
+	nc--;
+	return 0;
+      } else {
+#ifdef FULL_WARN
+	fprintf(stderr,"ERROR :: Pointer for deletion not found in linked list %p\n",ptr);
+#endif
+	return 0;
+      }
+    } // End purge
+    
+    // Insertion
+    if (size>0){
+      found=memfind(head,ptr);
+      if (found){
+#ifdef FULL_WARN
+	fprintf(stderr,"ERROR :: Pointer for insertion already in use in linked list %p\n",ptr);
+	//printlist(head,nc);
+#endif
+	return 0;
+      } else {
+	nc++;
+	meminsert(&head,ptr,size);
+	return 0;
+      }
+    }
+
+    // Getting allocation size
+    found=memfind(head,ptr);
+    if (found){
+      return found->size;
+    } else{
+#ifdef FULL_WARN
+      fprintf(stderr,"ERROR :: Pointer for size check NOT found in linked list\n");
+#endif
+      return 0;
+    }
+  }
+}
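+
+/* Illustrative sketch (editor's example) of the three mempush actions;
+   'p' and 'bytes' are hypothetical:
+
+     mempush(p, bytes, 0);           insert: record 'bytes' for p
+     size_t s = mempush(p, 0, 0);    query:  return the recorded size of p
+     mempush(p, 0, 1);               purge:  drop p from the list
+*/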
+
+node *memfind(node *head, const void *ptr){
+  node *next;
+  next=head;
+  while(next!=NULL){
+    if (next->ptr==ptr) return next;
+    next=next->next;
+  }
+  return NULL;
+}
+
+void memdel(node **head, node *found){
+  node *next;
+  if (found==*head){
+    next=(*head)->next;
+    free(*head);
+    *head=next;
+    return;
+  }
+  next=*head;
+  while(next->next!=found){
+    next=next->next;
+  }
+  next->next=next->next->next;
+  free(found);
+  return;
+}
+void meminsert(node **head, const void  *ptr,size_t size){
+  node *nhead;
+  nhead = (node*)malloc(sizeof(node));
+  nhead->ptr=ptr;
+  nhead->size=size;
+  nhead->next=*head;
+  *head=nhead;
+  return;
+}
+
+void printlist(node *head,hypre_int nc){
+  node *next;
+  next=head;
+  printf("Node count %d \n",nc);
+  while(next!=NULL){
+    printf("Address %p of size %zu \n",next->ptr,next->size);
+    next=next->next;
+  }
+}
+
+cudaStream_t getstreamOlde(hypre_int i){
+  static hypre_int firstcall=1;
+  const hypre_int MAXSTREAMS=10;
+  static cudaStream_t s[MAXSTREAMS];
+  if (firstcall){
+    hypre_int jj;
+    for(jj=0;jj<MAXSTREAMS;jj++)
+      gpuErrchk(cudaStreamCreateWithFlags(&s[jj],cudaStreamNonBlocking));
+    //printf("Created streams ..\n");
+    firstcall=0;
+  }
+  if (i<MAXSTREAMS) return s[i];
+  fprintf(stderr,"ERROR in HYPRE_STREAM in utilities/gpuMem.c %d is greater than MAXSTREAMS = %d\n Returning default stream",i,MAXSTREAMS);
+  return 0;
+}
+
+nvtxDomainHandle_t getdomain(hypre_int i){
+    static hypre_int firstcall=1;
+    const hypre_int MAXDOMAINS=1;
+    static nvtxDomainHandle_t h[MAXDOMAINS];
+    if (firstcall){
+      h[0]= nvtxDomainCreateA("HYPRE_A");
+      firstcall=0;
+    }
+    if (i<MAXDOMAINS) return h[i];
+    fprintf(stderr,"ERROR in getdomain in utilities/gpuMem.c %d  is greater than MAXDOMAINS = %d \n Returning default domain",i,MAXDOMAINS);
+    return NULL;
+  }
+
+cudaEvent_t getevent(hypre_int i){
+  static hypre_int firstcall=1;
+  const hypre_int MAXEVENTS=10;
+  static cudaEvent_t s[MAXEVENTS];
+  if (firstcall){
+    hypre_int jj;
+    for(jj=0;jj<MAXEVENTS;jj++)
+      gpuErrchk(cudaEventCreateWithFlags(&s[jj],cudaEventDisableTiming));
+    //printf("Created events ..\n");
+    firstcall=0;
+  }
+  if (i<MAXEVENTS) return s[i];
+  fprintf(stderr,"ERROR in getevent in utilities/gpuMem.c %d is greater than MAXEVENTS = %d\n Returning default stream",i,MAXEVENTS);
+  return 0;
+}
+
+hypre_int getsetasyncmode(hypre_int mode, hypre_int action){
+  static hypre_int async_mode=0;
+  if (action==0) async_mode = mode;
+  if (action==1) return async_mode;
+  return async_mode;
+}
+
+void SetAsyncMode(hypre_int mode){
+  getsetasyncmode(mode,0);
+}
+
+hypre_int GetAsyncMode(){
+  return getsetasyncmode(0,1);
+}
+
+void branchStream(hypre_int i, hypre_int j){
+  gpuErrchk(cudaEventRecord(getevent(i),HYPRE_STREAM(i)));
+  gpuErrchk(cudaStreamWaitEvent(HYPRE_STREAM(j),getevent(i),0));
+}
+
+void joinStreams(hypre_int i, hypre_int j, hypre_int k){
+  gpuErrchk(cudaEventRecord(getevent(i),HYPRE_STREAM(i)));
+  gpuErrchk(cudaEventRecord(getevent(j),HYPRE_STREAM(j)));
+  gpuErrchk(cudaStreamWaitEvent(HYPRE_STREAM(k),getevent(i),0));
+  gpuErrchk(cudaStreamWaitEvent(HYPRE_STREAM(k),getevent(j),0));
+}
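+
+/* Illustrative sketch (editor's example): fork work onto stream 5 after
+   stream 4 reaches this point, then make stream 6 wait for both:
+
+     branchStream(4, 5);
+     ... enqueue independent kernels on HYPRE_STREAM(4) and HYPRE_STREAM(5) ...
+     joinStreams(4, 5, 6);
+*/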
+
+void affs(hypre_int myid){
+  const hypre_int NCPUS=160;
+  cpu_set_t* mask = CPU_ALLOC(NCPUS);
+  size_t size = CPU_ALLOC_SIZE(NCPUS);
+  hypre_int cpus[NCPUS],i;
+  hypre_int retval=sched_getaffinity(0, size,mask);
+  if (!retval){
+    for(i=0;i<NCPUS;i++){
+      if (CPU_ISSET(i,mask)) 
+	cpus[i]=1; 
+      else
+	cpus[i]=0;
+    }
+    printf("Node(%d)::",myid);
+    for(i=0;i<160;i++)printf("%d",cpus[i]);
+    printf("\n");
+  } else {
+    fprintf(stderr,"sched_affinity failed\n");
+    switch(errno){
+    case EFAULT:
+      printf("INVALID MEMORY ADDRESS\n");
+      break;
+    case EINVAL:
+      printf("EINVAL:: NO VALID CPUS\n");
+      break;
+    default:
+      printf("%d something else\n",errno);
+    }
+  }
+  
+  CPU_FREE(mask);
+  
+}
+hypre_int getcore(){
+  const hypre_int NCPUS=160;
+  cpu_set_t* mask = CPU_ALLOC(NCPUS);
+  size_t size = CPU_ALLOC_SIZE(NCPUS);
+  hypre_int cpus[NCPUS],i;
+  hypre_int retval=sched_getaffinity(0, size,mask);
+  if (!retval){
+    for(i=0;i<NCPUS;i+=20){
+      if (CPU_ISSET(i,mask)) {
+	CPU_FREE(mask);
+	return i;
+      }
+    }
+  } else {
+    fprintf(stderr,"sched_affinity failed\n");
+    switch(errno){
+    case EFAULT:
+      printf("INVALID MEMORY ADDRESS\n");
+      break;
+    case EINVAL:
+      printf("EINVAL:: NO VALID CPUS\n");
+      break;
+    default:
+      printf("%d something else\n",errno);
+    }
+  }
+  CPU_FREE(mask);
+  return 0;
+}
+hypre_int getnuma(){
+  const hypre_int NCPUS=160;
+  cpu_set_t* mask = CPU_ALLOC(NCPUS);
+  size_t size = CPU_ALLOC_SIZE(NCPUS);
+  hypre_int retval=sched_getaffinity(0, size,mask);
+  /* HARDWIRED FOR 2 NUMA DOMAINS */
+  if (!retval){
+    hypre_int sum0=0,i;
+    for(i=0;i<NCPUS/2;i++) 
+      if (CPU_ISSET(i,mask)) sum0++;
+    hypre_int sum1=0;
+    for(i=NCPUS/2;i<NCPUS;i++) 
+      if (CPU_ISSET(i,mask)) sum1++;
+    CPU_FREE(mask);
+    if (sum0>sum1) return 0;
+    else return 1;
+  } else {
+    fprintf(stderr,"sched_affinity failed\n");
+    switch(errno){
+    case EFAULT:
+      printf("INVALID MEMORY ADDRESS\n");
+      break;
+    case EINVAL:
+      printf("EINVAL:: NO VALID CPUS\n");
+      break;
+    default:
+      printf("%d something else\n",errno);
+    }
+  }
+  CPU_FREE(mask);
+  return 0;
+}
+hypre_int checkDeviceProps(){
+  struct cudaDeviceProp prop;
+  gpuErrchk(cudaGetDeviceProperties(&prop, HYPRE_DEVICE));
+  HYPRE_GPU_CMA=prop.concurrentManagedAccess;
+  return HYPRE_GPU_CMA;
+}
+hypre_int pointerIsManaged(const void *ptr){
+  struct cudaPointerAttributes ptr_att;
+  if (cudaPointerGetAttributes(&ptr_att,ptr)!=cudaSuccess) {
+    return 0;
+  }
+  return ptr_att.isManaged;
+}
+#endif
diff --git a/src/utilities/gpuMem.h b/src/utilities/gpuMem.h
new file mode 100644
index 0000000..b4bf35a
--- /dev/null
+++ b/src/utilities/gpuMem.h
@@ -0,0 +1,104 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#if defined(HYPRE_USE_GPU) && defined(HYPRE_USE_MANAGED)
+#ifndef __GPUMEM_H__
+#define  __GPUMEM_H__
+#ifdef HYPRE_USE_GPU
+#include <cuda_runtime_api.h>
+void hypre_GPUInit(hypre_int use_device);
+void hypre_GPUFinalize();
+int VecScaleScalar(double *u, const double alpha,  int num_rows,cudaStream_t s);
+void VecCopy(double* tgt, const double* src, int size,cudaStream_t s);
+void VecSet(double* tgt, int size, double value, cudaStream_t s);
+void VecScale(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void VecScaleSplit(double *u, double *v, double *l1_norm, int num_rows,cudaStream_t s);
+void CudaCompileFlagCheck();
+#endif
+
+cudaStream_t getstreamOlde(hypre_int i);
+nvtxDomainHandle_t getdomain(hypre_int i);
+cudaEvent_t getevent(hypre_int i);
+void MemAdviseReadOnly(const void *ptr, hypre_int device);
+void MemAdviseUnSetReadOnly(const void *ptr, hypre_int device);
+void MemAdviseSetPrefLocDevice(const void *ptr, hypre_int device);
+void MemAdviseSetPrefLocHost(const void *ptr);
+void MemPrefetch(const void *ptr,hypre_int device,cudaStream_t stream);
+void MemPrefetchSized(const void *ptr,size_t size,hypre_int device,cudaStream_t stream);
+void MemPrefetchForce(const void *ptr,hypre_int device,cudaStream_t stream);
+cublasHandle_t getCublasHandle();
+cusparseHandle_t getCusparseHandle();
+typedef struct node {
+  const void *ptr;
+  size_t size;
+  struct node *next;
+} node;
+size_t mempush(const void *ptr, size_t size, hypre_int action);
+node *memfind(node *head, const void *ptr);
+void memdel(node **head, node *found);
+void meminsert(node **head, const void *ptr,size_t size);
+void printlist(node *head,hypre_int nc);
+//#define MEM_PAD_LEN 1
+size_t memsize(const void *ptr);
+hypre_int getsetasyncmode(hypre_int mode, hypre_int action);
+void SetAsyncMode(hypre_int mode);
+hypre_int GetAsyncMode();
+void branchStream(hypre_int i, hypre_int j);
+void joinStreams(hypre_int i, hypre_int j, hypre_int k);
+void affs(hypre_int myid);
+hypre_int getcore();
+hypre_int getnuma();
+hypre_int checkDeviceProps();
+hypre_int pointerIsManaged(const void *ptr);
+/*
+ * Global struct for keeping HYPRE GPU Init state
+ */
+
+#define MAX_HGS_ELEMENTS 10
+struct hypre__global_struct{
+  hypre_int initd;
+  hypre_int device;
+  hypre_int device_count;
+  cublasHandle_t cublas_handle;
+  cusparseHandle_t cusparse_handle;
+  cusparseMatDescr_t cusparse_mat_descr;
+  cudaStream_t streams[MAX_HGS_ELEMENTS];
+  nvtxDomainHandle_t nvtx_domain;
+  hypre_int concurrent_managed_access;
+  size_t memoryHWM;
+};
+
+extern struct hypre__global_struct hypre__global_handle ;
+
+/*
+ * Macros for accessing elements of the global handle
+ */
+#define HYPRE_GPU_HANDLE hypre__global_handle.initd
+#define HYPRE_CUBLAS_HANDLE hypre__global_handle.cublas_handle
+#define HYPRE_CUSPARSE_HANDLE hypre__global_handle.cusparse_handle
+#define HYPRE_DEVICE hypre__global_handle.device
+#define HYPRE_DEVICE_COUNT hypre__global_handle.device_count
+#define HYPRE_CUSPARSE_MAT_DESCR hypre__global_handle.cusparse_mat_descr
+#define HYPRE_STREAM(index) (hypre__global_handle.streams[index])
+#define HYPRE_DOMAIN  hypre__global_handle.nvtx_domain
+#define HYPRE_GPU_CMA hypre__global_handle.concurrent_managed_access
+#define HYPRE_GPU_HWM hypre__global_handle.memoryHWM
+
+#endif
+
+#else
+
+#define hypre_GPUInit(use_device)
+#define hypre_GPUFinalize()
+
+#endif
+
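
The node/meminsert/memfind/memdel declarations above support tracking the
size of each managed allocation when no size header is in use. A minimal
sketch of such a pointer-size registry (illustrative only; hypre's mempush
multiplexes insert, lookup, and delete through its action argument):

    #include <stdlib.h>

    typedef struct node {
       const void *ptr;
       size_t size;
       struct node *next;
    } node;

    /* Record ptr -> size at the head of the list. */
    static void insert(node **head, const void *ptr, size_t size)
    {
       node *n = (node*)malloc(sizeof(node));
       n->ptr = ptr; n->size = size; n->next = *head;
       *head = n;
    }

    /* Return the recorded size of ptr and unlink its entry (0 if absent). */
    static size_t remove_entry(node **head, const void *ptr)
    {
       node **pp = head;
       while (*pp && (*pp)->ptr != ptr) pp = &(*pp)->next;
       if (!*pp) return 0;
       node *hit = *pp;
       size_t size = hit->size;
       *pp = hit->next;
       free(hit);
       return size;
    }

    int main(void)
    {
       node *head = NULL;
       int x;
       insert(&head, &x, sizeof x);
       return remove_entry(&head, &x) == sizeof x ? 0 : 1;
    }
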
diff --git a/src/utilities/headers b/src/utilities/headers
index c99c0a7..4c5def0 100755
--- a/src/utilities/headers
+++ b/src/utilities/headers
@@ -19,11 +19,18 @@ INTERNAL_HEADER=_hypre_utilities.h
 
 cat > $INTERNAL_HEADER <<@
 
+/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/
+
+
 #ifndef hypre_UTILITIES_HEADER
 #define hypre_UTILITIES_HEADER
 
 #include "HYPRE_utilities.h"
 
+#ifdef HYPRE_USING_OPENMP
+#include <omp.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -38,19 +45,19 @@ cat general.h                  >> $INTERNAL_HEADER
 cat mpistubs.h                 >> $INTERNAL_HEADER
 cat hypre_smp.h                >> $INTERNAL_HEADER
 cat hypre_memory.h             >> $INTERNAL_HEADER
-cat thread_mpistubs.h          >> $INTERNAL_HEADER
 cat threading.h                >> $INTERNAL_HEADER		          
 cat timing.h                   >> $INTERNAL_HEADER
 cat amg_linklist.h             >> $INTERNAL_HEADER
 cat exchange_data.h            >> $INTERNAL_HEADER
 cat hypre_error.h              >> $INTERNAL_HEADER
-./protos amg_linklist.c        >> $INTERNAL_HEADER
-./protos binsearch.c           >> $INTERNAL_HEADER
-./protos hypre_printf.c        >> $INTERNAL_HEADER
-./protos hypre_qsort.c         >> $INTERNAL_HEADER
-./protos qsplit.c              >> $INTERNAL_HEADER
-./protos random.c              >> $INTERNAL_HEADER
-		          
+cat caliper_instrumentation.h  >> $INTERNAL_HEADER
+cat gpgpu.h                    >> $INTERNAL_HEADER
+cat hypre_nvtx.h               >> $INTERNAL_HEADER
+cat gpuErrorCheck.h            >> $INTERNAL_HEADER
+cat gpuMem.h                   >> $INTERNAL_HEADER
+
+cat protos.h                   >> $INTERNAL_HEADER
+
 #===========================================================================
 # Include guards
 #===========================================================================
diff --git a/src/utilities/hypre_error.h b/src/utilities/hypre_error.h
index 5217d67..9a21f27 100644
--- a/src/utilities/hypre_error.h
+++ b/src/utilities/hypre_error.h
@@ -10,8 +10,6 @@
  * $Revision$
  ***********************************************************************EHEADER*/
 
-
-
 #ifndef hypre_ERROR_HEADER
 #define hypre_ERROR_HEADER
 
@@ -37,3 +35,4 @@ void hypre_error_handler(const char *filename, HYPRE_Int line, HYPRE_Int ierr, c
 #endif
 
 #endif
+
diff --git a/src/utilities/hypre_memory.c b/src/utilities/hypre_memory.c
index fed7211..2406605 100644
--- a/src/utilities/hypre_memory.c
+++ b/src/utilities/hypre_memory.c
@@ -16,12 +16,14 @@
  *
  *****************************************************************************/
 
+#define HYPRE_USE_MANAGED_SCALABLE 1
 #include "_hypre_utilities.h"
-
+//#include "gpgpu.h"
+//#include "hypre_nvtx.h"
+//#include "gpuMem.h"
 #ifdef HYPRE_USE_UMALLOC
 #undef HYPRE_USE_UMALLOC
 #endif
-
 /******************************************************************************
  *
  * Standard routines
@@ -54,10 +56,23 @@ hypre_MAlloc( size_t size )
 
    if (size > 0)
    {
+     PUSH_RANGE_PAYLOAD("MALLOC",2,size);
 #ifdef HYPRE_USE_UMALLOC
       HYPRE_Int threadid = hypre_GetThreadID();
-
+#ifdef HYPRE_USE_MANAGED
+      printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n");
+#endif
       ptr = _umalloc_(size);
+#elif HYPRE_USE_MANAGED
+#ifdef HYPRE_USE_MANAGED_SCALABLE
+      gpuErrchk( cudaMallocManaged(&ptr,size+sizeof(size_t)*MEM_PAD_LEN,CUDAMEMATTACHTYPE) );
+      size_t *sp=(size_t*)ptr;
+      *sp=size;
+      ptr=(void*)(&sp[MEM_PAD_LEN]);
+#else
+      gpuErrchk( cudaMallocManaged(&ptr,size,CUDAMEMATTACHTYPE) );
+      mempush(ptr,size,0);
+#endif
 #else
       ptr = malloc(size);
 #endif
@@ -68,6 +83,7 @@ hypre_MAlloc( size_t size )
         hypre_OutOfMemory(size);
       }
 #endif
+      POP_RANGE;
    }
    else
    {
@@ -90,10 +106,23 @@ hypre_CAlloc( size_t count,
 
    if (size > 0)
    {
+     PUSH_RANGE_PAYLOAD("MALLOC",4,size);
 #ifdef HYPRE_USE_UMALLOC
+#ifdef HYPRE_USE_MANAGED
+      printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n");
+#endif
       HYPRE_Int threadid = hypre_GetThreadID();
 
       ptr = _ucalloc_(count, elt_size);
+#elif HYPRE_USE_MANAGED
+#ifdef HYPRE_USE_MANAGED_SCALABLE
+      ptr=(void*)hypre_MAlloc(size);
+      memset(ptr,0,count*elt_size);
+#else
+      gpuErrchk( cudaMallocManaged(&ptr,size,CUDAMEMATTACHTYPE) );
+      memset(ptr,0,count*elt_size);
+      mempush(ptr,size,0);
+#endif
 #else
       ptr = calloc(count, elt_size);
 #endif
@@ -104,6 +133,7 @@ hypre_CAlloc( size_t count,
         hypre_OutOfMemory(size);
       }
 #endif
+      POP_RANGE;
    }
    else
    {
@@ -113,6 +143,11 @@ hypre_CAlloc( size_t count,
    return(char*) ptr;
 }
 
+#ifdef HYPRE_USE_MANAGED
+size_t memsize(const void *ptr)
+{
+  return ((size_t*)ptr)[-MEM_PAD_LEN];
+}
+#endif
 /*--------------------------------------------------------------------------
  * hypre_ReAlloc
  *--------------------------------------------------------------------------*/
@@ -135,6 +170,32 @@ hypre_ReAlloc( char   *ptr,
       HYPRE_Int threadid = hypre_GetThreadID();
       ptr = (char*)_urealloc_(ptr, size);
    }
+#elif HYPRE_USE_MANAGED
+   if (ptr == NULL)
+   {
+
+      ptr = hypre_MAlloc(size);
+   }
+   else if (size == 0)
+   {
+     hypre_Free(ptr);
+     return NULL;
+   }
+   else
+   {
+     void *nptr = hypre_MAlloc(size);
+#ifdef HYPRE_USE_MANAGED_SCALABLE
+     size_t old_size=memsize((void*)ptr);
+#else
+     size_t old_size=mempush((void*)ptr,0,0);
+#endif
+     if (size>old_size)
+       memcpy(nptr,ptr,old_size);
+     else
+       memcpy(nptr,ptr,size);
+     hypre_Free(ptr);
+     ptr=(char*) nptr;
+   }
 #else
    if (ptr == NULL)
    {
@@ -156,6 +217,7 @@ hypre_ReAlloc( char   *ptr,
    return ptr;
 }
 
+
 /*--------------------------------------------------------------------------
  * hypre_Free
  *--------------------------------------------------------------------------*/
@@ -169,6 +231,182 @@ hypre_Free( char *ptr )
       HYPRE_Int threadid = hypre_GetThreadID();
 
       _ufree_(ptr);
+#elif HYPRE_USE_MANAGED
+      //size_t size=mempush(ptr,0,0);
+#ifdef HYPRE_USE_MANAGED_SCALABLE
+      cudaSafeFree(ptr,MEM_PAD_LEN);
+#else
+      mempush(ptr,0,1);
+      cudaSafeFree(ptr,0);
+#endif
+      //gpuErrchk(cudaFree((void*)ptr));
+#else
+      free(ptr);
+#endif
+   }
+}
+/*--------------------------------------------------------------------------
+ * hypre_MAllocPinned
+ *--------------------------------------------------------------------------*/
+
+char *
+hypre_MAllocPinned( size_t size )
+{
+   void *ptr;
+
+   if (size > 0)
+   {
+     PUSH_RANGE_PAYLOAD("MALLOC",2,size);
+#ifdef HYPRE_USE_UMALLOC
+      HYPRE_Int threadid = hypre_GetThreadID();
+#ifdef HYPRE_USE_MANAGED
+      printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n");
+#endif
+      ptr = _umalloc_(size);
+#elif HYPRE_USE_MANAGED
+#ifdef HYPRE_USE_MANAGED_SCALABLE
+#ifdef HYPRE_GPU_USE_PINNED
+      gpuErrchk( cudaHostAlloc(&ptr,size+sizeof(size_t)*MEM_PAD_LEN,cudaHostAllocMapped));
+#else
+      gpuErrchk( cudaMallocManaged(&ptr,size+sizeof(size_t)*MEM_PAD_LEN,CUDAMEMATTACHTYPE) );
+#endif
+      size_t *sp=(size_t*)ptr;
+      *sp=size;
+      ptr=(void*)(&sp[MEM_PAD_LEN]);
+#else
+      gpuErrchk( cudaMallocManaged(&ptr,size,CUDAMEMATTACHTYPE) );
+      mempush(ptr,size,0);
+#endif
+#else
+      ptr = malloc(size);
+#endif
+
+#if 1
+      if (ptr == NULL)
+      {
+        hypre_OutOfMemory(size);
+      }
+#endif
+      POP_RANGE;
+   }
+   else
+   {
+      ptr = NULL;
+   }
+
+   return (char*)ptr;
+}
+/*--------------------------------------------------------------------------
+ * hypre_MAllocHost
+ *--------------------------------------------------------------------------*/
+
+char *
+hypre_MAllocHost( size_t size )
+{
+   void *ptr;
+
+   if (size > 0)
+   {
+     PUSH_RANGE_PAYLOAD("MAllocHost",2,size);
+     ptr = malloc(size);
+#if 1
+      if (ptr == NULL)
+      {
+        hypre_OutOfMemory(size);
+      }
+#endif
+      POP_RANGE;
+   }
+   else
+   {
+      ptr = NULL;
+   }
+
+   return (char*)ptr;
+}
+
+/*--------------------------------------------------------------------------
+ * hypre_CAllocHost
+ *--------------------------------------------------------------------------*/
+
+char *
+hypre_CAllocHost( size_t count,
+		  size_t elt_size )
+{
+   void   *ptr;
+   size_t  size = count*elt_size;
+
+   if (size > 0)
+   {
+     PUSH_RANGE_PAYLOAD("CAllocHost",4,size);
+#ifdef HYPRE_USE_UMALLOC
+#ifdef HYPRE_USE_MANAGED
+      printf("ERROR HYPRE_USE_UMALLOC AND HYPRE_USE_MANAGED are mutually exclusive\n");
+#endif
+      HYPRE_Int threadid = hypre_GetThreadID();
+
+      ptr = _ucalloc_(count, elt_size);
+
+#else
+     ptr = calloc(count, elt_size);
+#endif
+
+#if 1
+      if (ptr == NULL)
+      {
+        hypre_OutOfMemory(size);
+      }
+#endif
+      POP_RANGE;
+   }
+   else
+   {
+      ptr = NULL;
+   }
+
+   return(char*) ptr;
+}
+/*--------------------------------------------------------------------------
+ * hypre_ReAllocHost
+ *--------------------------------------------------------------------------*/
+
+char *
+hypre_ReAllocHost( char   *ptr,
+               size_t  size )
+{
+   if (ptr == NULL)
+   {
+      ptr = (char*)malloc(size);
+   }
+   else
+   {
+      ptr = (char*)realloc(ptr, size);
+   }
+
+#if 1
+   if ((ptr == NULL) && (size > 0))
+   {
+      hypre_OutOfMemory(size);
+   }
+#endif
+
+   return ptr;
+}
+
+/*--------------------------------------------------------------------------
+ * hypre_FreeHost
+ *--------------------------------------------------------------------------*/
+
+void
+hypre_FreeHost( char *ptr )
+{
+   if (ptr)
+   {
+#ifdef HYPRE_USE_UMALLOC
+      HYPRE_Int threadid = hypre_GetThreadID();
+
+      _ufree_(ptr);
+
 #else
       free(ptr);
 #endif
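
The HYPRE_USE_MANAGED_SCALABLE path above prepends a size_t header to every
allocation: hypre_MAlloc requests size + sizeof(size_t)*MEM_PAD_LEN bytes,
stores the size in the header, and returns the address just past it;
memsize() reads the header back at index -MEM_PAD_LEN, and cudaSafeFree
rewinds by the same pad before freeing. A plain-malloc sketch of the scheme
(the names here are illustrative, not hypre API):

    #include <stdio.h>
    #include <stdlib.h>

    #define PAD_LEN 1  /* counterpart of MEM_PAD_LEN */

    static void *padded_malloc(size_t size)
    {
       size_t *sp = (size_t*)malloc(size + sizeof(size_t) * PAD_LEN);
       if (!sp) return NULL;
       *sp = size;                  /* record the user-visible size */
       return (void*)&sp[PAD_LEN];  /* hand out the region past the header */
    }

    static size_t padded_size(const void *ptr)  /* counterpart of memsize() */
    {
       return ((const size_t*)ptr)[-PAD_LEN];
    }

    static void padded_free(void *ptr)
    {
       free((size_t*)ptr - PAD_LEN);  /* rewind past the header */
    }

    int main(void)
    {
       void *p = padded_malloc(100);
       printf("recorded size: %zu\n", padded_size(p));  /* prints 100 */
       padded_free(p);
       return 0;
    }

This is also what lets hypre_ReAlloc recover the old allocation size via
memsize() before copying into the new buffer.
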
diff --git a/src/utilities/hypre_memory.h b/src/utilities/hypre_memory.h
index 1539bdd..6716c5a 100644
--- a/src/utilities/hypre_memory.h
+++ b/src/utilities/hypre_memory.h
@@ -26,6 +26,157 @@
 extern "C" {
 #endif
 
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_MANAGED)
+#ifdef __cplusplus
+extern "C++" {
+#endif
+#include <cuda.h>
+#include <cuda_runtime.h>
+#ifdef __cplusplus
+}
+#endif
+#define HYPRE_CUDA_GLOBAL __host__ __device__
+  
+#if defined(HYPRE_MEMORY_GPU)
+#define hypre_DeviceTAlloc(type, count) \
+  ({									\
+    type * ptr;								\
+    cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \
+    if ( cudaerr != cudaSuccess ) {					\
+      printf("\n ERROR hypre_DataTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+      HYPRE_Int *p = NULL; *p = 1;						\
+    }									\
+    ptr;})
+	
+#define hypre_DeviceCTAlloc(type, count) \
+	({								   \
+	type * ptr;						   \
+	cudaError_t cudaerr = cudaMalloc((void**)&ptr,sizeof(type)*(count)); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}		\
+	cudaMemset(ptr,0,sizeof(type)*(count));	   \
+	ptr;})									   \
+	
+#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr;				\
+	                                         cudaMalloc((void**)&newptr,sizeof(type)*(count));	\
+											 cudaMemcpy(newptr, ptr, sizeof(type)*(count), cudaMemcpyDeviceToDevice); \
+											 cudaFree(ptr);				\
+											 ptr = newptr;}
+#else
+ #define hypre_DeviceTAlloc(type, count) \
+	({																	\
+	type * ptr;															\
+	cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal);\
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n ERROR hypre_DataTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}\
+	ptr;})
+	
+#define hypre_DeviceCTAlloc(type, count) \
+	({								   \
+	type * ptr;						   \
+	cudaError_t cudaerr = cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}		\
+	cudaMemset(ptr,0,sizeof(type)*(count));	   \
+	ptr;})									   \
+	
+#define hypre_DeviceTReAlloc(ptr, type, count) {type *newptr;				\
+	                                      cudaMallocManaged((void**)&newptr,sizeof(type)*(count), cudaMemAttachGlobal);	\
+					      memcpy(newptr, ptr, sizeof(type)*(count)); \
+					      cudaFree(ptr);		\
+					      ptr = newptr;} 
+#endif
+  
+#define hypre_DeviceTFree(ptr) \
+	{											\
+		cudaError_t cudaerr = cudaFree(ptr);							\
+		if ( cudaerr != cudaSuccess ) {									\
+			printf("\n CudaFree : %s in %s(%d) function %s\n",cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+			HYPRE_Int *p = NULL; *p = 1;										\
+		}																\
+	}																	\
+	
+
+#define hypre_DataCopyToData(ptrH,ptrD,type,count)						\
+	{cudaError_t cudaerr = cudaMemcpy(ptrD, ptrH, sizeof(type)*count, cudaMemcpyHostToDevice); \
+if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCopyToData %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+}							  \
+	}
+	
+	
+#define hypre_DataCopyFromData(ptrH,ptrD,type,count)						\
+	{cudaError_t cudaerr = cudaMemcpy(ptrH, ptrD, sizeof(type)*count, cudaMemcpyDeviceToHost); \
+	if ( cudaerr != cudaSuccess ) {										\
+		printf("\n hypre_DataCTAlloc %lu : %s in %s(%d) function %s\n",sizeof(type)*(count),cudaGetErrorString(cudaerr),__FILE__,__LINE__,__FUNCTION__); \
+		HYPRE_Int *p = NULL; *p = 1;\
+	}\
+	}
+
+#define hypre_DeviceMemset(ptr,value,type,count)	\
+	cudaMemset(ptr,value,count*sizeof(type));
+	
+#define hypre_UMTAlloc(type, count)				\
+  ({									\
+      type * ptr;								\
+      cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+      ptr;								\
+  })
+	
+#define hypre_UMCTAlloc(type, count)					\
+  ({									\
+    type * ptr;								\
+    cudaMallocManaged((void**)&ptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+    cudaMemset(ptr,0,sizeof(type)*(count));				\
+    ptr;})								\
+  
+  
+#define hypre_UMTReAlloc(ptr, type, count)\
+  ({							 \
+    type *newptr;							\
+    cudaMallocManaged((void**)&newptr,sizeof(type)*(count), cudaMemAttachGlobal); \
+    cudaFree(ptr);							\
+    (ptr) = newptr;							\
+    newptr;})								\
+  
+#define hypre_UMTFree(ptr) \
+      cudaFree(ptr)
+
+#define hypre_InitMemoryDebug(id)
+#define hypre_FinalizeMemoryDebug()
+
+#define hypre_TAlloc(type, count) \
+( (type *)hypre_MAlloc((size_t)(sizeof(type) * (count))) )
+
+#define hypre_CTAlloc(type, count) \
+( (type *)hypre_CAlloc((size_t)(count), (size_t)sizeof(type)) )
+
+#define hypre_TReAlloc(ptr, type, count) \
+( (type *)hypre_ReAlloc((char *)ptr, (size_t)(sizeof(type) * (count))) )
+
+#define hypre_TFree(ptr) \
+( hypre_Free((char *)ptr), ptr = NULL )
+  
+  //#define hypre_TAlloc(type, count)  hypre_UMTAlloc(type, count)
+  //#define hypre_CTAlloc(type, count) hypre_UMCTAlloc(type, count)
+  //#define hypre_TReAlloc(ptr, type, count) hypre_UMTReAlloc(type, count)
+  //#define hypre_TFree(ptr) hypre_UMTFree(ptr)
+
+#define hypre_SharedTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_SharedCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_SharedTReAlloc(type, count) hypre_TReAlloc(type, (count))
+#define hypre_SharedTFree(ptr) hypre_TFree(ptr)
+#else
+#define HYPRE_CUDA_GLOBAL 
+
 /*--------------------------------------------------------------------------
  * Use "Debug Malloc Library", dmalloc
  *--------------------------------------------------------------------------*/
@@ -79,6 +230,34 @@ extern "C" {
 #define hypre_SharedTReAlloc(type, count) hypre_TReAlloc(type, (count))
 #define hypre_SharedTFree(ptr) hypre_TFree(ptr)
 
+#define hypre_DeviceTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_DeviceCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_DeviceTReAlloc(ptr, type, count) hypre_TReAlloc(ptr, type, (count))
+#define hypre_DeviceTFree(ptr) hypre_TFree(ptr)
+#define hypre_DataCopyToData(ptrH,ptrD,type,count) memcpy(ptrD, ptrH, sizeof(type)*(count))
+#define hypre_DataCopyFromData(ptrH,ptrD,type,count) memcpy(ptrH, ptrD, sizeof(type)*(count))
+#define hypre_DeviceMemset(ptr,value,type,count)	memset(ptr,value,count*sizeof(type))
+#define hypre_UMTAlloc(type, count) hypre_TAlloc(type, (count))
+#define hypre_UMCTAlloc(type, count) hypre_CTAlloc(type, (count))
+#define hypre_UMTReAlloc(ptr, type, count) hypre_TReAlloc(ptr, type, (count))
+#define hypre_UMTFree(ptr) hypre_TFree(ptr)
+#endif
+  
+#define hypre_PinnedTAlloc(type, count)\
+( (type *)hypre_MAllocPinned((size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostTAlloc(type, count) \
+( (type *)hypre_MAllocHost((size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostCTAlloc(type, count) \
+( (type *)hypre_CAllocHost((size_t)(count), (size_t)sizeof(type)) )
+
+#define hypre_HostTReAlloc(ptr, type, count) \
+( (type *)hypre_ReAllocHost((char *)ptr, (size_t)(sizeof(type) * (count))) )
+
+#define hypre_HostTFree(ptr) \
+( hypre_FreeHost((char *)ptr), ptr = NULL )
+
 /*--------------------------------------------------------------------------
  * Prototypes
  *--------------------------------------------------------------------------*/
@@ -87,8 +266,13 @@ extern "C" {
 HYPRE_Int hypre_OutOfMemory ( size_t size );
 char *hypre_MAlloc ( size_t size );
 char *hypre_CAlloc ( size_t count , size_t elt_size );
+char *hypre_MAllocPinned( size_t size );
 char *hypre_ReAlloc ( char *ptr , size_t size );
 void hypre_Free ( char *ptr );
+char *hypre_CAllocHost( size_t count,size_t elt_size );
+char *hypre_MAllocHost( size_t size );
+char *hypre_ReAllocHost( char   *ptr,size_t  size );
+void hypre_FreeHost( char *ptr );
 char *hypre_SharedMAlloc ( size_t size );
 char *hypre_SharedCAlloc ( size_t count , size_t elt_size );
 char *hypre_SharedReAlloc ( char *ptr , size_t size );
@@ -108,3 +292,4 @@ void hypre_FreeDML( char *ptr , char *file , HYPRE_Int line );
 #endif
 
 #endif
+
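
Call sites use the typed macros rather than the raw allocators, so the same
source line compiles against malloc or cudaMallocManaged depending on the
build flags. A sketch of the call-site shape with plain libc backends
(TAlloc/CTAlloc/TFree are stand-ins for the hypre_ macros above):

    #include <stdlib.h>

    #define TAlloc(type, count)  ((type*)malloc(sizeof(type) * (count)))
    #define CTAlloc(type, count) ((type*)calloc((count), sizeof(type)))
    #define TFree(ptr)           (free(ptr), (ptr) = NULL)

    int main(void)
    {
       double *v = CTAlloc(double, 1000);  /* zero-initialized, like hypre_CTAlloc */
       int    *w = TAlloc(int, 10);
       TFree(v);  /* pointer is also reset to NULL, as with hypre_TFree */
       TFree(w);
       return 0;
    }
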
diff --git a/src/utilities/hypre_nvtx.h b/src/utilities/hypre_nvtx.h
new file mode 100644
index 0000000..95a3bec
--- /dev/null
+++ b/src/utilities/hypre_nvtx.h
@@ -0,0 +1,72 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+#ifdef USE_NVTX
+#include "nvToolsExt.h"
+#include "nvToolsExtCudaRt.h"
+
+static const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
+static const int num_colors = sizeof(colors)/sizeof(uint32_t);
+
+#define PUSH_RANGE(name,cid) { \
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib);	\
+}
+
+#define PUSH_RANGE_PAYLOAD(name,cid,load) {		\
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    eventAttrib.payloadType = NVTX_PAYLOAD_TYPE_INT64; \
+    eventAttrib.payload.llValue = load; \
+    eventAttrib.category=1; \
+    nvtxDomainRangePushEx(HYPRE_DOMAIN,&eventAttrib); \
+}
+
+#define PUSH_RANGE_DOMAIN(name,cid,dId) {				\
+    int color_id = cid; \
+    color_id = color_id%num_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    nvtxDomainRangePushEx(getdomain(dId),&eventAttrib);	\
+}
+
+#define POP_RANGE nvtxDomainRangePop(HYPRE_DOMAIN);
+#define POP_RANGE_DOMAIN(dId) {			\
+  nvtxDomainRangePop(getdomain(dId));		\
+  }
+#else
+#define PUSH_RANGE(name,cid)
+#define POP_RANGE
+#define PUSH_RANGE_PAYLOAD(name,cid,load)
+#define PUSH_RANGE_DOMAIN(name,cid,domainName)
+#endif
+
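
Typical instrumentation with these macros brackets a code region with a
named, colored NVTX range; without USE_NVTX they compile away to nothing.
A sketch (the function body is a stand-in, and under USE_NVTX the macros
additionally assume the hypre headers that define HYPRE_DOMAIN are included
first):

    #include <stdlib.h>
    #include "hypre_nvtx.h"

    static void relax_sweep(double *u, int n)
    {
       PUSH_RANGE_PAYLOAD("relax_sweep", 3, n);  /* payload = problem size */
       for (int i = 0; i < n; i++)
          u[i] *= 0.5;
       POP_RANGE;
    }

    int main(void)
    {
       double *u = (double*)calloc(1000, sizeof(double));
       relax_sweep(u, 1000);
       free(u);
       return 0;
    }
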
diff --git a/src/utilities/hypre_printf.c b/src/utilities/hypre_printf.c
index f49a8e8..f5385af 100644
--- a/src/utilities/hypre_printf.c
+++ b/src/utilities/hypre_printf.c
@@ -14,7 +14,7 @@
 #include <stdarg.h>
 #include <stdio.h>
 
-#ifdef HYPRE_BIGINT
+// #ifdef HYPRE_BIGINT
 
 /* these prototypes are missing by default for some compilers */
 int vscanf( const char *format , va_list arg );
@@ -42,32 +42,50 @@ new_format( const char *format,
       }
       else if (foundpercent)
       {
+         if (*fp == 'l')
+         {
+            fp++; /* remove 'l' and maybe add it back in switch statement */
+            if (*fp == 'l')
+            {
+               fp++; /* remove second 'l' if present */
+            }
+         }
          switch(*fp)
          {
             case 'd':
+            case 'i':
+#if defined(HYPRE_BIGINT)
                *nfp = 'l'; nfp++;
                *nfp = 'l'; nfp++;
-            case 'c':
+#endif
+               foundpercent = 0; break;
+            case 'f':
             case 'e':
             case 'E':
-            case 'f':
             case 'g':
             case 'G':
-            case 'i':
+#if defined(HYPRE_SINGLE)          /* no modifier */
+#elif defined(HYPRE_LONG_DOUBLE)   /* modify with 'L' */
+               *nfp = 'L'; nfp++;
+#else                              /* modify with 'l' (default is _double_) */
+               *nfp = 'l'; nfp++;
+#endif
+               foundpercent = 0; break;
+            case 'c':
             case 'n':
             case 'o':
             case 'p':
             case 's':
             case 'u':
             case 'x':
-            case 'S':
+            case 'X':
             case '%':
-               foundpercent = 0;
+               foundpercent = 0; break;
          }
       }
       *nfp = *fp; nfp++;
    }
-   *nfp = *fp; nfp++;
+   *nfp = *fp;
 
    *newformat_ptr = newformat;
 
@@ -79,9 +97,7 @@ new_format( const char *format,
 HYPRE_Int
 free_format( char *newformat )
 {
-#ifdef HYPRE_BIGINT
    hypre_TFree(newformat);
-#endif
 
    return 0;
 }
@@ -186,9 +202,9 @@ hypre_sscanf( char *s, const char *format, ...)
    return ierr;
 }
 
-#else
-
-/* this is used only to eliminate compiler warnings */
-HYPRE_Int hypre_printf_empty;
-
-#endif
+// #else
+// 
+// /* this is used only to eliminate compiler warnings */
+// HYPRE_Int hypre_printf_empty;
+// 
+// #endif
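
The effect of the rewriting above is that callers always write plain C
conversions ("%d" for HYPRE_Int, "%f"/"%e"/"%g" for HYPRE_Real) and
new_format splices in the length modifier that matches the build: "ll" for
integers under HYPRE_BIGINT, 'l' or 'L' for floating point under double or
HYPRE_LONG_DOUBLE. A minimal sketch of the integer case (bigint_format is
illustrative and handles only 'd' after stripping any caller-supplied 'l'):

    #include <stdio.h>

    static void bigint_format(const char *fmt, char *out)
    {
       int in_spec = 0;
       for (; *fmt; fmt++)
       {
          if (*fmt == '%') in_spec = 1;
          else if (in_spec && *fmt == 'l') continue;  /* drop caller's 'l' */
          else if (in_spec && *fmt == 'd')
          {
             *out++ = 'l'; *out++ = 'l';  /* HYPRE_Int is long long */
             in_spec = 0;
          }
          *out++ = *fmt;
       }
       *out = '\0';
    }

    int main(void)
    {
       char buf[64];
       bigint_format("%d of %ld rows\n", buf);
       printf("%s", buf);  /* prints "%lld of %lld rows" */
       return 0;
    }
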
diff --git a/src/utilities/mpistubs.c b/src/utilities/mpistubs.c
index dbfa344..3723f93 100644
--- a/src/utilities/mpistubs.c
+++ b/src/utilities/mpistubs.c
@@ -25,7 +25,7 @@ hypre_MPI_Comm_f2c( hypre_int comm )
 #ifdef HYPRE_HAVE_MPI_COMM_F2C
    return (hypre_MPI_Comm) MPI_Comm_f2c(comm);
 #else
-   return (hypre_MPI_Comm) comm;
+   return (hypre_MPI_Comm) (size_t)comm;
 #endif
 }
 
diff --git a/src/utilities/mpistubs.h b/src/utilities/mpistubs.h
index 4c845ba..fa7a175 100644
--- a/src/utilities/mpistubs.h
+++ b/src/utilities/mpistubs.h
@@ -46,10 +46,13 @@ extern "C" {
 
 #define MPI_COMM_WORLD      hypre_MPI_COMM_WORLD       
 #define MPI_COMM_NULL       hypre_MPI_COMM_NULL
+#define MPI_COMM_SELF       hypre_MPI_COMM_SELF
 
 #define MPI_BOTTOM  	    hypre_MPI_BOTTOM
 
+#define MPI_FLOAT           hypre_MPI_FLOAT
 #define MPI_DOUBLE          hypre_MPI_DOUBLE           
+#define MPI_LONG_DOUBLE     hypre_MPI_LONG_DOUBLE
 #define MPI_INT             hypre_MPI_INT              
 #define MPI_LONG_LONG_INT   hypre_MPI_INT              
 #define MPI_CHAR            hypre_MPI_CHAR             
@@ -61,6 +64,8 @@ extern "C" {
 #define MPI_MIN             hypre_MPI_MIN              
 #define MPI_MAX             hypre_MPI_MAX              
 #define MPI_LOR             hypre_MPI_LOR              
+#define MPI_SUCCESS         hypre_MPI_SUCCESS
+#define MPI_STATUSES_IGNORE hypre_MPI_STATUSES_IGNORE
 
 #define MPI_UNDEFINED       hypre_MPI_UNDEFINED        
 #define MPI_REQUEST_NULL    hypre_MPI_REQUEST_NULL        
@@ -68,8 +73,6 @@ extern "C" {
 #define MPI_ANY_TAG         hypre_MPI_ANY_TAG
 #define MPI_SOURCE          hypre_MPI_SOURCE
 #define MPI_TAG             hypre_MPI_TAG
-#define MPI_SUCCESS         hypre_MPI_SUCCESS
-#define MPI_STATUSES_IGNORE hypre_MPI_STATUSES_IGNORE
 
 #define MPI_Init            hypre_MPI_Init             
 #define MPI_Finalize        hypre_MPI_Finalize         
@@ -93,9 +96,9 @@ extern "C" {
 #define MPI_Allgather       hypre_MPI_Allgather        
 #define MPI_Allgatherv      hypre_MPI_Allgatherv       
 #define MPI_Gather          hypre_MPI_Gather       
-#define MPI_Gatherv          hypre_MPI_Gatherv       
+#define MPI_Gatherv         hypre_MPI_Gatherv       
 #define MPI_Scatter         hypre_MPI_Scatter       
-#define MPI_Scatterv         hypre_MPI_Scatterv       
+#define MPI_Scatterv        hypre_MPI_Scatterv       
 #define MPI_Bcast           hypre_MPI_Bcast            
 #define MPI_Send            hypre_MPI_Send             
 #define MPI_Recv            hypre_MPI_Recv             
@@ -122,6 +125,9 @@ extern "C" {
 #define MPI_Type_struct     hypre_MPI_Type_struct      
 #define MPI_Type_commit     hypre_MPI_Type_commit
 #define MPI_Type_free       hypre_MPI_Type_free        
+#define MPI_Op_free         hypre_MPI_Op_free        
+#define MPI_Op_create       hypre_MPI_Op_create
+#define MPI_User_function   hypre_MPI_User_function
 
 /*--------------------------------------------------------------------------
  * Types, etc.
@@ -132,6 +138,7 @@ typedef HYPRE_Int hypre_MPI_Comm;
 typedef HYPRE_Int hypre_MPI_Group;
 typedef HYPRE_Int hypre_MPI_Request;
 typedef HYPRE_Int hypre_MPI_Datatype;
+typedef void (hypre_MPI_User_function) ();
 
 typedef struct
 {
@@ -141,18 +148,21 @@ typedef struct
 typedef HYPRE_Int  hypre_MPI_Op;
 typedef HYPRE_Int  hypre_MPI_Aint;
 
+#define  hypre_MPI_COMM_SELF 1
 #define  hypre_MPI_COMM_WORLD 0
 #define  hypre_MPI_COMM_NULL  -1
 
 #define  hypre_MPI_BOTTOM  0x0
 
-#define  hypre_MPI_DOUBLE 0
-#define  hypre_MPI_INT 1
-#define  hypre_MPI_CHAR 2
-#define  hypre_MPI_LONG 3
-#define  hypre_MPI_BYTE 4
-#define  hypre_MPI_REAL 5
-#define  hypre_MPI_COMPLEX 6
+#define  hypre_MPI_FLOAT 0
+#define  hypre_MPI_DOUBLE 1
+#define  hypre_MPI_LONG_DOUBLE 2
+#define  hypre_MPI_INT 3
+#define  hypre_MPI_CHAR 4
+#define  hypre_MPI_LONG 5
+#define  hypre_MPI_BYTE 6
+#define  hypre_MPI_REAL 7
+#define  hypre_MPI_COMPLEX 8
 
 #define  hypre_MPI_SUM 0
 #define  hypre_MPI_MIN 1
@@ -179,13 +189,16 @@ typedef MPI_Datatype hypre_MPI_Datatype;
 typedef MPI_Status   hypre_MPI_Status;
 typedef MPI_Op       hypre_MPI_Op;
 typedef MPI_Aint     hypre_MPI_Aint;
+typedef MPI_User_function    hypre_MPI_User_function;
 
 #define  hypre_MPI_COMM_WORLD MPI_COMM_WORLD
 #define  hypre_MPI_COMM_NULL  MPI_COMM_NULL
 #define  hypre_MPI_BOTTOM     MPI_BOTTOM
-#define  hypre_MPI_SUCCESS    MPI_SUCCESS
+#define  hypre_MPI_COMM_SELF  MPI_COMM_SELF
 
+#define  hypre_MPI_FLOAT   MPI_FLOAT
 #define  hypre_MPI_DOUBLE  MPI_DOUBLE
+#define  hypre_MPI_LONG_DOUBLE  MPI_LONG_DOUBLE
 /* HYPRE_MPI_INT is defined in HYPRE_utilities.h */
 #define  hypre_MPI_INT     HYPRE_MPI_INT
 #define  hypre_MPI_CHAR    MPI_CHAR
@@ -200,6 +213,8 @@ typedef MPI_Aint     hypre_MPI_Aint;
 #define  hypre_MPI_MIN MPI_MIN
 #define  hypre_MPI_MAX MPI_MAX
 #define  hypre_MPI_LOR MPI_LOR
+#define  hypre_MPI_SUCCESS MPI_SUCCESS
+#define  hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE
 
 #define  hypre_MPI_UNDEFINED       MPI_UNDEFINED   
 #define  hypre_MPI_REQUEST_NULL    MPI_REQUEST_NULL
@@ -207,7 +222,6 @@ typedef MPI_Aint     hypre_MPI_Aint;
 #define  hypre_MPI_ANY_TAG         MPI_ANY_TAG
 #define  hypre_MPI_SOURCE          MPI_SOURCE
 #define  hypre_MPI_TAG             MPI_TAG
-#define  hypre_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE
 #define  hypre_MPI_LAND            MPI_LAND
 
 #endif
@@ -272,9 +286,12 @@ HYPRE_Int hypre_MPI_Type_hvector( HYPRE_Int count , HYPRE_Int blocklength , hypr
 HYPRE_Int hypre_MPI_Type_struct( HYPRE_Int count , HYPRE_Int *array_of_blocklengths , hypre_MPI_Aint *array_of_displacements , hypre_MPI_Datatype *array_of_types , hypre_MPI_Datatype *newtype );
 HYPRE_Int hypre_MPI_Type_commit( hypre_MPI_Datatype *datatype );
 HYPRE_Int hypre_MPI_Type_free( hypre_MPI_Datatype *datatype );
+HYPRE_Int hypre_MPI_Op_free( hypre_MPI_Op *op );
+HYPRE_Int hypre_MPI_Op_create( hypre_MPI_User_function *function , hypre_int commute , hypre_MPI_Op *op );
 
 #ifdef __cplusplus
 }
 #endif
 
 #endif
+
diff --git a/src/utilities/protos.h b/src/utilities/protos.h
new file mode 100644
index 0000000..94b6c69
--- /dev/null
+++ b/src/utilities/protos.h
@@ -0,0 +1,221 @@
+/*BHEADER**********************************************************************
+ * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ * This file is part of HYPRE.  See file COPYRIGHT for details.
+ *
+ * HYPRE is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License (as published by the Free
+ * Software Foundation) version 2.1 dated February 1999.
+ *
+ * $Revision$
+ ***********************************************************************EHEADER*/
+
+/* amg_linklist.c */
+void hypre_dispose_elt ( hypre_LinkList element_ptr );
+void hypre_remove_point ( hypre_LinkList *LoL_head_ptr , hypre_LinkList *LoL_tail_ptr , HYPRE_Int measure , HYPRE_Int index , HYPRE_Int *lists , HYPRE_Int *where );
+hypre_LinkList hypre_create_elt ( HYPRE_Int Item );
+void hypre_enter_on_lists ( hypre_LinkList *LoL_head_ptr , hypre_LinkList *LoL_tail_ptr , HYPRE_Int measure , HYPRE_Int index , HYPRE_Int *lists , HYPRE_Int *where );
+
+/* binsearch.c */
+HYPRE_Int hypre_BinarySearch ( HYPRE_Int *list , HYPRE_Int value , HYPRE_Int list_length );
+HYPRE_Int hypre_BinarySearch2 ( HYPRE_Int *list , HYPRE_Int value , HYPRE_Int low , HYPRE_Int high , HYPRE_Int *spot );
+HYPRE_Int *hypre_LowerBound( HYPRE_Int *first, HYPRE_Int *last, HYPRE_Int value );
+
+/* hypre_complex.c */
+#ifdef HYPRE_COMPLEX
+HYPRE_Complex hypre_conj( HYPRE_Complex value );
+HYPRE_Real    hypre_cabs( HYPRE_Complex value );
+HYPRE_Real    hypre_creal( HYPRE_Complex value );
+HYPRE_Real    hypre_cimag( HYPRE_Complex value );
+#else
+#define hypre_conj(value)  value
+#define hypre_cabs(value)  fabs(value)
+#define hypre_creal(value) value
+#define hypre_cimag(value) 0.0
+#endif
+
+/* hypre_printf.c */
+// #ifdef HYPRE_BIGINT
+HYPRE_Int hypre_printf( const char *format , ... );
+HYPRE_Int hypre_fprintf( FILE *stream , const char *format, ... );
+HYPRE_Int hypre_sprintf( char *s , const char *format, ... );
+HYPRE_Int hypre_scanf( const char *format , ... );
+HYPRE_Int hypre_fscanf( FILE *stream , const char *format, ... );
+HYPRE_Int hypre_sscanf( char *s , const char *format, ... );
+// #else
+// #define hypre_printf  printf
+// #define hypre_fprintf fprintf
+// #define hypre_sprintf sprintf
+// #define hypre_scanf   scanf
+// #define hypre_fscanf  fscanf
+// #define hypre_sscanf  sscanf
+// #endif
+
+/* hypre_qsort.c */
+void hypre_swap ( HYPRE_Int *v , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap2 ( HYPRE_Int *v , HYPRE_Real *w , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap2i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap3i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap3_d ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap4_d ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int *y , HYPRE_Int i , HYPRE_Int j );
+void hypre_swap_d ( HYPRE_Real *v , HYPRE_Int i , HYPRE_Int j );
+void hypre_qsort0 ( HYPRE_Int *v , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort1 ( HYPRE_Int *v , HYPRE_Real *w , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort2i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort2 ( HYPRE_Int *v , HYPRE_Real *w , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort3i ( HYPRE_Int *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort3_abs ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort4_abs ( HYPRE_Real *v , HYPRE_Int *w , HYPRE_Int *z , HYPRE_Int *y , HYPRE_Int left , HYPRE_Int right );
+void hypre_qsort_abs ( HYPRE_Real *w , HYPRE_Int left , HYPRE_Int right );
+
+/* qsplit.c */
+HYPRE_Int hypre_DoubleQuickSplit ( HYPRE_Real *values , HYPRE_Int *indices , HYPRE_Int list_length , HYPRE_Int NumberKept );
+
+/* random.c */
+HYPRE_CUDA_GLOBAL void hypre_SeedRand ( HYPRE_Int seed );
+HYPRE_CUDA_GLOBAL HYPRE_Int hypre_RandI ( void );
+HYPRE_CUDA_GLOBAL HYPRE_Real hypre_Rand ( void );
+
+/* hypre_prefix_sum.c */
+/**
+ * Assumed to be called within an omp parallel region.
+ * Let x_i be the input of the i-th thread.
+ * The output of the i-th thread is y_i = x_0 + x_1 + ... + x_{i-1}.
+ * Additionally, sum = x_0 + x_1 + ... + x_{nthreads - 1}.
+ * Note that y_0 is always 0.
+ *
+ * @param workspace at least with length (nthreads+1)
+ *                  workspace[tid] will contain result for tid
+ *                  workspace[nthreads] will contain sum
+ */
+void hypre_prefix_sum(HYPRE_Int *in_out, HYPRE_Int *sum, HYPRE_Int *workspace);
+/**
+ * This version does the prefix sum in pairs.
+ * Useful when we compute prefix sums of diag and offd counts in tandem.
+ *
+ * @param workspace at least with length 2*(nthreads+1)
+ *                  workspace[2*tid] and workspace[2*tid+1] will contain results for tid
+ *                  workspace[2*nthreads] and workspace[2*nthreads + 1] will contain sums
+ */
+void hypre_prefix_sum_pair(HYPRE_Int *in_out1, HYPRE_Int *sum1, HYPRE_Int *in_out2, HYPRE_Int *sum2, HYPRE_Int *workspace);
+/**
+ * @param workspace at least with length 3*(nthreads+1)
+ *                  workspace[3*tid:3*tid+3) will contain results for tid
+ */
+void hypre_prefix_sum_triple(HYPRE_Int *in_out1, HYPRE_Int *sum1, HYPRE_Int *in_out2, HYPRE_Int *sum2, HYPRE_Int *in_out3, HYPRE_Int *sum3, HYPRE_Int *workspace);
+
+/**
+ * n prefix-sums together.
+ * workspace[n*tid:n*(tid+1)) will contain results for tid
+ * workspace[nthreads*tid:nthreads*(tid+1)) will contain sums
+ *
+ * @param workspace at least with length n*(nthreads+1)
+ */
+void hypre_prefix_sum_multiple(HYPRE_Int *in_out, HYPRE_Int *sum, HYPRE_Int n, HYPRE_Int *workspace);
+
+/* hypre_merge_sort.c */
+/**
+ * Why merge sort?
+ * 1) Merge sort can take advantage of eliminating duplicates.
+ * 2) Merge sort is more efficiently parallelizable than qsort
+ */
+
+/**
+ * Out of place merge sort with duplicate elimination
+ * @return the number of unique elements
+ */
+HYPRE_Int hypre_merge_sort_unique(HYPRE_Int *in, HYPRE_Int *out, HYPRE_Int len);
+/**
+ * Out of place merge sort with duplicate elimination
+ *
+ * @param out pointer to the output; can be in or temp
+ * @return the number of unique elements
+ */
+HYPRE_Int hypre_merge_sort_unique2(HYPRE_Int *in, HYPRE_Int *temp, HYPRE_Int len, HYPRE_Int **out);
+
+void hypre_merge_sort(HYPRE_Int *in, HYPRE_Int *temp, HYPRE_Int len, HYPRE_Int **sorted);
+
+/* hypre_hopscotch_hash.c */
+
+#ifdef HYPRE_USING_OPENMP
+
+/* Check if atomic operations are available to use concurrent hopscotch hash table */
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) && (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 40100
+#define HYPRE_USING_ATOMIC 
+//#elif defined _MSC_VER // JSP: haven't tested, so comment out for now
+//#define HYPRE_USING_ATOMIC
+//#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__)
+// JSP: not many compilers have implemented this, so comment out for now
+//#define HYPRE_USING_ATOMIC
+//#include <stdatomic.h>
+#endif
+
+#endif // HYPRE_USING_OPENMP
+
+#ifdef HYPRE_HOPSCOTCH
+#ifdef HYPRE_USING_ATOMIC
+// concurrent hopscotch hashing is possible only with atomic supports
+#define HYPRE_CONCURRENT_HOPSCOTCH 
+#endif 
+#endif 
+
+#ifdef HYPRE_CONCURRENT_HOPSCOTCH
+typedef struct {
+  HYPRE_Int volatile timestamp;
+  omp_lock_t         lock;
+} hypre_HopscotchSegment;
+#endif
+
+/**
+ * The typical use case of the unordered set is inserting an input sequence
+ * with many duplicates (e.g., all colidx received from other ranks),
+ * followed by one sweep of enumeration.
+ * Since the capacity is set to the number of inputs, which is much larger
+ * than the number of unique elements, we optimize for initialization and
+ * enumeration, whose time is proportional to the capacity.
+ * For initialization and enumeration, a structure of arrays (SoA) is better
+ * for vectorization, cache line utilization, and so on.
+ */
+typedef struct
+{
+  HYPRE_Int  volatile              segmentMask;
+  HYPRE_Int  volatile              bucketMask;
+#ifdef HYPRE_CONCURRENT_HOPSCOTCH
+  hypre_HopscotchSegment* volatile segments;
+#endif
+  HYPRE_Int *volatile              key;
+  hypre_uint *volatile             hopInfo;
+  HYPRE_Int *volatile              hash;
+} hypre_UnorderedIntSet;
+
+typedef struct
+{
+  hypre_uint volatile hopInfo;
+  HYPRE_Int  volatile hash;
+  HYPRE_Int  volatile key;
+  HYPRE_Int  volatile data;
+} hypre_HopscotchBucket;
+
+/**
+ * The typical use case of the unordered map is inserting an input sequence
+ * with no duplicates (the inverse map of a bijective mapping) followed by
+ * many lookups.
+ * For lookups, an array of structures (AoS) gives better cache line utilization.
+ */
+typedef struct
+{
+  HYPRE_Int  volatile              segmentMask;
+  HYPRE_Int  volatile              bucketMask;
+#ifdef HYPRE_CONCURRENT_HOPSCOTCH
+  hypre_HopscotchSegment* volatile segments;
+#endif
+  hypre_HopscotchBucket* volatile  table;
+} hypre_UnorderedIntMap;
+
+/**
+ * Sort array "in" with length len and put result in array "out"
+ * "in" will be deallocated unless in == *out
+ * inverse_map is an inverse hash table s.t. inverse_map[i] = j iff (*out)[j] = i
+ */
+void hypre_sort_and_create_inverse_map(
+  HYPRE_Int *in, HYPRE_Int len, HYPRE_Int **out, hypre_UnorderedIntMap *inverse_map);
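
The prefix-sum helpers above implement the standard two-phase pattern:
inside a parallel region each thread publishes its count, one scan turns
the counts into exclusive offsets, and every thread then knows where to
write. A minimal sketch matching the documented contract (prefix_sum_sketch
is illustrative, not hypre's implementation; the scan is serial for
brevity):

    #include <omp.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void prefix_sum_sketch(int *in_out, int *sum, int *workspace)
    {
       int tid = omp_get_thread_num();
       int nthreads = omp_get_num_threads();
       workspace[tid + 1] = *in_out;  /* publish this thread's count */
    #pragma omp barrier
    #pragma omp single               /* serial scan over nthreads entries */
       {
          int i;
          workspace[0] = 0;
          for (i = 1; i <= nthreads; i++)
             workspace[i] += workspace[i - 1];
       }                             /* implicit barrier after single */
       *in_out = workspace[tid];     /* exclusive prefix for this thread */
       *sum = workspace[nthreads];   /* total over all threads */
    }

    int main(void)
    {
       int *workspace = (int*)malloc((omp_get_max_threads() + 1) * sizeof(int));
    #pragma omp parallel
       {
          int offset = omp_get_thread_num() + 1;  /* pretend per-thread count */
          int total;
          prefix_sum_sketch(&offset, &total, workspace);
          printf("tid %d writes at offset %d of %d\n",
                 omp_get_thread_num(), offset, total);
       }
       free(workspace);
       return 0;
    }
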
diff --git a/src/utilities/random.c b/src/utilities/random.c
index 2dac3d8..91b806a 100644
--- a/src/utilities/random.c
+++ b/src/utilities/random.c
@@ -38,25 +38,39 @@
 
 #include "_hypre_utilities.h"
 
-/*--------------------------------------------------------------------------
- * Static variables
- *--------------------------------------------------------------------------*/
+#if defined(HYPRE_MEMORY_GPU) || defined(HYPRE_USE_MANAGED)
+__managed__ __device__
+#endif
 
+/*-------------------------------------------------------------------------------
+ * Static global variable: Seed
+ * ``... all initial seeds between 1 and 2147483646 (2^31-2) are equally valid''
+ *-------------------------------------------------------------------------------*/
 static HYPRE_Int Seed = 13579;
 
-#define a  16807
-#define m  2147483647
-#define q  127773
-#define r  2836
+#define a  16807      /* 7^5 */
+#define m  2147483647 /* 2^31 - 1 */
+#define q  127773     /* m div a */
+#define r  2836       /* m mod a */
 
 /*--------------------------------------------------------------------------
  * Initializes the pseudo-random number generator to a place in the sequence.
  *
  * @param seed an HYPRE_Int containing the seed for the RNG.
  *--------------------------------------------------------------------------*/
-
+HYPRE_CUDA_GLOBAL
 void  hypre_SeedRand( HYPRE_Int seed )
 {
+   /* RL: seed must be between 1 and 2^31-2 */
+   if (seed < 1) 
+   {
+      seed = 1;
+   }
+   else if (seed >= m)
+   {
+     seed = m - 1;
+   }
+
    Seed = seed;
 }
 
@@ -64,11 +78,10 @@ void  hypre_SeedRand( HYPRE_Int seed )
  * Computes the next pseudo-random number in the sequence using the global
  * variable Seed.
  *
- * @return a HYPRE_Real containing the next number in the sequence divided by
- * 2147483647 so that the numbers are in (0, 1].
+ * @return a HYPRE_Int in [1, 2147483646]
  *--------------------------------------------------------------------------*/
-
-HYPRE_Real  hypre_Rand()
+HYPRE_CUDA_GLOBAL
+HYPRE_Int  hypre_RandI()
 {
    HYPRE_Int  low, high, test;
 
@@ -84,5 +97,19 @@ HYPRE_Real  hypre_Rand()
       Seed = test + m;
    }
 
-   return ((HYPRE_Real)(Seed) / m);
+   return Seed;
 }
+
+/*--------------------------------------------------------------------------
+ * Computes the next pseudo-random number in the sequence using the global
+ * variable Seed.
+ *
+ * @return a HYPRE_Real containing the next number in the sequence divided by
+ * 2147483647 so that the numbers are in (0, 1).
+ *--------------------------------------------------------------------------*/
+HYPRE_CUDA_GLOBAL
+HYPRE_Real  hypre_Rand()
+{
+   return ((HYPRE_Real)(hypre_RandI()) / m);
+}
+
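
The generator above is the Park-Miller minimal standard LCG,
Seed <- (16807 * Seed) mod (2^31 - 1), computed with Schrage's
factorization m = a*q + r so that no intermediate product overflows a
signed 32-bit integer. A standalone sketch of the same recurrence
(next_seed is an illustrative name, not hypre API):

    #include <stdio.h>

    #define A 16807       /* 7^5 */
    #define M 2147483647  /* 2^31 - 1 */
    #define Q 127773      /* M / A */
    #define R 2836        /* M % A */

    /* One Schrage step: (A * seed) % M without overflow. */
    static int next_seed(int seed)
    {
       int hi = seed / Q;
       int lo = seed % Q;
       int test = A * lo - R * hi;
       return (test > 0) ? test : test + M;
    }

    int main(void)
    {
       int seed = 13579, i;  /* hypre's default seed */
       for (i = 0; i < 3; i++)
       {
          seed = next_seed(seed);
          printf("%d  -> %.6f\n", seed, (double)seed / M);  /* like hypre_Rand */
       }
       return 0;
    }
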
diff --git a/src/utilities/threading.h b/src/utilities/threading.h
index ab9e478..1d756cc 100644
--- a/src/utilities/threading.h
+++ b/src/utilities/threading.h
@@ -30,3 +30,4 @@ HYPRE_Int hypre_GetThreadNum( void );
 void hypre_GetSimpleThreadPartition( HYPRE_Int *begin, HYPRE_Int *end, HYPRE_Int n );
 
 #endif
+
diff --git a/src/utilities/timing.c b/src/utilities/timing.c
index c84a61c..00999df 100644
--- a/src/utilities/timing.c
+++ b/src/utilities/timing.c
@@ -328,9 +328,9 @@ hypre_PrintTiming( const char     *heading,
          local_wall_time = hypre_TimingWallTime(i);
          local_cpu_time  = hypre_TimingCPUTime(i);
          hypre_MPI_Allreduce(&local_wall_time, &wall_time, 1,
-                       hypre_MPI_DOUBLE, hypre_MPI_MAX, comm);
+                       hypre_MPI_REAL, hypre_MPI_MAX, comm);
          hypre_MPI_Allreduce(&local_cpu_time, &cpu_time, 1,
-                       hypre_MPI_DOUBLE, hypre_MPI_MAX, comm);
+                       hypre_MPI_REAL, hypre_MPI_MAX, comm);
 
          if (myrank == 0)
          {
diff --git a/src/utilities/timing.h b/src/utilities/timing.h
index 5cfd03d..64cc190 100644
--- a/src/utilities/timing.h
+++ b/src/utilities/timing.h
@@ -48,8 +48,8 @@ HYPRE_Real time_get_cpu_seconds_( void );
 #define hypre_IncFLOPCount(inc)
 #define hypre_BeginTiming(i)
 #define hypre_EndTiming(i)
-#define hypre_ClearTiming()
 #define hypre_PrintTiming(heading, comm)
+#define hypre_ClearTiming()
 
 /*--------------------------------------------------------------------------
  * With timing on
@@ -119,3 +119,4 @@ HYPRE_Int hypre_PrintTiming( const char *heading , MPI_Comm comm );
 #endif
 
 #endif
+
