[p4est] 01/04: New upstream version 1.1+git20161115

Matthias Maier tamiko-guest at moszumanska.debian.org
Tue Nov 15 21:06:24 UTC 2016


This is an automated email from the git hooks/post-receive script.

tamiko-guest pushed a commit to branch master
in repository p4est.

commit 6fd5c2e00580cefa292cd2e037dac1cc0df5ae59
Author: Matthias Maier <tamiko+DEBIAN at kyomu.43-1.org>
Date:   Tue Nov 15 20:27:51 2016 +0000

    New upstream version 1.1+git20161115
---
 .gitmodules                                        |    1 -
 AUTHORS                                            |    6 +
 Doxyfile.in                                        |    6 +-
 Makefile.am                                        |    2 +-
 README                                             |   45 +-
 ...e-c27b6579bbf507787146bd88e1b2c4f998a9ea07.yaml |   42 +
 build-aux/git-version-gen                          |    2 +-
 config/p4est_petsc.m4                              |   87 +-
 configure.ac                                       |    4 +-
 doc/FREEBSD                                        |   23 +
 doc/announce/nadv14no37.txt                        |   36 +
 doc/attic/p8est_trilinear.c                        |    1 +
 doc/attic/p8est_trilinear.h                        |    1 +
 doc/attic/p8est_trilinear_lnodes.c                 |    1 +
 doc/attic/p8est_trilinear_nodes.c                  |    1 +
 doc/author_lahnert.txt                             |    1 +
 doc/mainpage.dox                                   |    9 +-
 doc/morton/morton.py                               |  237 +++
 doc/octreemap.pdf                                  |  Bin 0 -> 101014 bytes
 doc/p4est-howto.pdf                                |  Bin 0 -> 474947 bytes
 doc/p4est_license_approval.pdf                     |  Bin 0 -> 58924 bytes
 doc/webpage/index.html                             |  107 +-
 example/balance_seeds/balance_seeds2.c             |   27 +-
 example/balance_seeds/balance_seeds3.c             |    1 +
 example/mesh/mesh2.c                               |   86 +-
 example/mesh/mesh3.c                               |    1 +
 example/mesh/periodicity3.c                        |    1 +
 example/p6est/Makefile.am                          |   25 -
 example/p6est/p6est_io.c                           |    0
 example/p6est/p6est_io.h                           |    0
 example/points/points2.c                           |    1 +
 example/points/points3.c                           |    1 +
 example/simple/simple2.c                           |    1 +
 example/simple/simple3.c                           |    1 +
 example/steps/p4est_step1.c                        |    1 +
 example/steps/p4est_step2.c                        |    1 +
 example/steps/p4est_step3.c                        |   61 +-
 example/steps/p4est_step4.c                        |    5 +-
 example/steps/p8est_step1.c                        |    1 +
 example/steps/p8est_step2.c                        |    1 +
 example/steps/p8est_step3.c                        |    1 +
 example/steps/p8est_step4.c                        |    1 +
 example/tetgen/read_conn2.c                        |    4 +-
 example/tetgen/read_conn3.c                        |    4 +-
 example/tetgen/read_tetgen.c                       |    1 +
 example/tetgen/write_conn2.c                       |    4 +-
 example/tetgen/write_conn3.c                       |    4 +-
 example/timings/bricks2.c                          |    1 +
 example/timings/bricks3.c                          |    1 +
 example/timings/loadconn2.c                        |    1 +
 example/timings/loadconn3.c                        |    1 +
 example/timings/timings2.c                         |   23 +-
 example/timings/timings3.c                         |    1 +
 example/timings/tsearch3.c                         |   13 +-
 sc/AUTHORS                                         |   10 +
 sc/Doxyfile.in                                     |    2 +-
 sc/Makefile.am                                     |    8 +-
 sc/README                                          |   12 +-
 sc/config/sc_include.m4                            |    2 +
 sc/config/sc_memalign.m4                           |   96 ++
 sc/config/sc_mpi.m4                                |   63 +
 sc/config/sc_openmp.m4                             |   49 +
 sc/config/sc_pthread.m4                            |    5 +
 sc/configure.ac                                    |   34 +-
 sc/doc/FREEBSD                                     |   23 +
 sc/doc/author_kestener.txt                         |    1 +
 sc/doc/sc_license_approval.pdf                     |  Bin 0 -> 44626 bytes
 sc/doc/v1.2.branches                               |   32 +
 sc/example/bspline/bspline.c                       |    1 +
 sc/example/bspline/bspline_per.c                   |    1 +
 sc/example/dmatrix/dmatrix.c                       |    1 +
 sc/example/function/function.c                     |    1 +
 sc/example/logging/logging.c                       |    1 +
 sc/example/openmp/Makefile.am                      |   13 +
 sc/example/openmp/config_with_omp.txt              |   11 +
 sc/example/openmp/juqueen_job.js                   |   26 +
 sc/example/openmp/openmp.c                         |   42 +
 sc/example/options/Makefile.am                     |    7 +-
 sc/example/options/logging.c                       |   89 ++
 sc/example/options/options.c                       |   24 +-
 .../{options.ini => sc_options_example.ini}        |    0
 .../{preload.ini => sc_options_preload.ini}        |    0
 sc/example/pthread/condvar.c                       |    1 +
 sc/example/pthread/pthread.c                       |    3 +-
 sc/example/testing/Makefile.am                     |    9 +
 sc/example/testing/sc_test_shmem.c                 |  324 ++++
 sc/example/warp/warp.c                             |    1 +
 sc/iniparser/Makefile.am                           |    2 +-
 .../{twisted.ini => sc_iniparser_twisted.ini}      |    0
 sc/src/Makefile.am                                 |    6 +-
 sc/src/sc.c                                        |  391 ++++-
 sc/src/sc.h                                        |  121 +-
 sc/src/sc_allgather.c                              |    1 +
 sc/src/sc_allgather.h                              |    1 +
 sc/src/sc_amr.c                                    |    1 +
 sc/src/sc_amr.h                                    |    1 +
 sc/src/sc_blas.c                                   |    3 +-
 sc/src/sc_blas.h                                   |   16 +-
 sc/src/sc_bspline.c                                |    1 +
 sc/src/sc_bspline.h                                |    1 +
 sc/src/sc_containers.c                             |   93 +-
 sc/src/sc_containers.h                             |  107 +-
 sc/src/sc_dmatrix.c                                |  170 ++-
 sc/src/sc_dmatrix.h                                |  236 ++-
 sc/src/sc_flops.c                                  |    1 +
 sc/src/sc_flops.h                                  |    1 +
 sc/src/sc_functions.c                              |   55 +
 sc/src/sc_functions.h                              |   30 +
 sc/src/sc_getopt.h                                 |    1 +
 sc/src/sc_io.c                                     |   89 +-
 sc/src/sc_io.h                                     |   37 +-
 sc/src/sc_keyvalue.c                               |   40 +
 sc/src/sc_keyvalue.h                               |  154 +-
 sc/src/sc_lapack.c                                 |    3 +-
 sc/src/sc_lapack.h                                 |   25 +-
 sc/src/sc_lua.h                                    |    1 +
 sc/src/sc_mpi.c                                    |  383 ++++-
 sc/src/sc_mpi.h                                    |  121 +-
 sc/src/sc_notify.c                                 |    1 +
 sc/src/sc_notify.h                                 |    1 +
 sc/src/sc_obstack.h                                |    1 +
 sc/src/sc_options.c                                |  257 +++-
 sc/src/sc_options.h                                |  233 +--
 sc/src/{sc_sort.h => sc_private.h}                 |   32 +-
 sc/src/sc_ranges.c                                 |    1 +
 sc/src/sc_ranges.h                                 |    1 +
 sc/src/sc_reduce.c                                 |    1 +
 sc/src/sc_reduce.h                                 |    1 +
 sc/src/sc_refcount.c                               |  110 ++
 sc/src/sc_refcount.h                               |  120 ++
 sc/src/sc_search.c                                 |    3 +-
 sc/src/sc_search.h                                 |    1 +
 sc/src/sc_shmem.c                                  |  928 ++++++++++++
 sc/src/sc_shmem.h                                  |  169 +++
 sc/src/sc_sort.c                                   |    1 +
 sc/src/sc_sort.h                                   |    1 +
 sc/src/sc_statistics.c                             |    9 +
 sc/src/sc_statistics.h                             |    4 +
 sc/src/sc_string.c                                 |  102 ++
 sc/src/sc_string.h                                 |  107 ++
 sc/src/sc_unique_counter.c                         |   77 +
 sc/src/sc_unique_counter.h                         |   76 +
 sc/src/sc_warp.c                                   |    1 +
 sc/src/sc_warp.h                                   |    1 +
 sc/test/Makefile.am                                |    5 +
 sc/test/test_allgather.c                           |    1 +
 sc/test/test_arrays.c                              |    5 +
 sc/test/test_builtin.c                             |    1 +
 sc/test/test_darray_work.c                         |   81 +
 sc/test/test_dmatrix.c                             |  203 ++-
 sc/test/test_dmatrix_pool.c                        |    1 +
 sc/test/test_io_sink.c                             |    3 +-
 sc/test/test_keyvalue.c                            |    1 +
 sc/test/test_node_comm.c                           |  147 ++
 sc/test/test_notify.c                              |    1 +
 sc/test/test_pqueue.c                              |    1 +
 sc/test/test_reduce.c                              |    1 +
 sc/test/test_search.c                              |    1 +
 sc/test/test_sort.c                                |    1 +
 sc/test/test_sortb.c                               |    1 +
 src/Makefile.am                                    |   17 +-
 src/p4est.c                                        |  306 ++--
 src/p4est.h                                        |   44 +-
 src/p4est_algorithms.c                             |  583 +++----
 src/p4est_algorithms.h                             |    7 +
 src/p4est_balance.c                                |    3 +-
 src/p4est_balance.h                                |    3 +-
 src/p4est_base.c                                   |    1 +
 src/p4est_base.h                                   |   44 +-
 src/p4est_bits.c                                   |   24 +
 src/p4est_bits.h                                   |   10 +
 src/p4est_communication.c                          |  287 +++-
 src/p4est_communication.h                          |  108 +-
 src/p4est_connectivity.c                           |  620 ++++++--
 src/p4est_connectivity.h                           |   46 +-
 src/p4est_connrefine.c                             |  179 +++
 src/p4est_extended.h                               |   91 ++
 src/p4est_geometry.c                               |    1 +
 src/p4est_geometry.h                               |    4 +-
 src/p4est_ghost.c                                  |  286 +++-
 src/p4est_ghost.h                                  |   86 +-
 src/p4est_io.c                                     |   22 +-
 src/p4est_io.h                                     |    5 +-
 src/p4est_iterate.c                                |    1 +
 src/p4est_iterate.h                                |    5 +-
 src/p4est_lnodes.c                                 |  250 ++-
 src/p4est_lnodes.h                                 |   28 +-
 src/p4est_mesh.c                                   |  106 +-
 src/p4est_mesh.h                                   |    3 +-
 src/p4est_nodes.c                                  |    1 +
 src/p4est_nodes.h                                  |    1 +
 src/p4est_plex.c                                   |  660 ++++++--
 src/p4est_plex.h                                   |    1 +
 src/p4est_points.c                                 |    7 +-
 src/p4est_points.h                                 |    1 +
 src/p4est_search.c                                 |  176 ++-
 src/p4est_search.h                                 |   68 +-
 src/p4est_to_p8est.h                               |   86 +-
 src/p4est_vtk.c                                    | 1591 ++++++++++++++------
 src/p4est_vtk.h                                    |  305 ++--
 src/p4est_wrap.c                                   |  492 +++++-
 src/p4est_wrap.h                                   |  201 ++-
 {example/p6est => src}/p6est.c                     |  470 +++++-
 {example/p6est => src}/p6est.h                     |   36 +-
 src/p6est_communication.c                          |  196 +++
 src/p6est_communication.h                          |  114 ++
 {example/p6est => src}/p6est_extended.h            |   30 +-
 {example/p6est => src}/p6est_ghost.c               |    3 +-
 {example/p6est => src}/p6est_ghost.h               |    9 +-
 {example/p6est => src}/p6est_lnodes.c              |   82 +-
 {example/p6est => src}/p6est_lnodes.h              |   15 +-
 {example/p6est => src}/p6est_profile.c             |   42 +-
 {example/p6est => src}/p6est_profile.h             |    9 +-
 {example/p6est => src}/p6est_vtk.c                 |    3 +-
 {example/p6est => src}/p6est_vtk.h                 |    3 +-
 src/p8est.c                                        |    1 +
 src/p8est.h                                        |   38 +-
 src/p8est_algorithms.c                             |    1 +
 src/p8est_algorithms.h                             |    7 +
 src/p8est_balance.c                                |    3 +-
 src/p8est_balance.h                                |    3 +-
 src/p8est_bits.c                                   |    1 +
 src/p8est_bits.h                                   |   10 +
 src/p8est_communication.c                          |    1 +
 src/p8est_communication.h                          |  108 +-
 src/p8est_connectivity.c                           |  200 ++-
 src/p8est_connectivity.h                           |  102 +-
 .../timings/bricks3.c => src/p8est_connrefine.c    |    3 +-
 src/p8est_extended.h                               |   91 ++
 src/p8est_geometry.c                               |    1 +
 src/p8est_geometry.h                               |    4 +-
 src/p8est_ghost.c                                  |    1 +
 src/p8est_ghost.h                                  |   88 +-
 src/p8est_io.c                                     |    2 +-
 src/p8est_io.h                                     |    5 +-
 src/p8est_iterate.c                                |    1 +
 src/p8est_iterate.h                                |    7 +-
 src/p8est_lnodes.c                                 |    1 +
 src/p8est_lnodes.h                                 |   31 +-
 src/p8est_mesh.c                                   |    1 +
 src/p8est_mesh.h                                   |    5 +-
 src/p8est_nodes.c                                  |    1 +
 src/p8est_nodes.h                                  |    1 +
 src/p8est_plex.c                                   |    1 +
 src/p8est_plex.h                                   |    1 +
 src/p8est_points.c                                 |    1 +
 src/p8est_points.h                                 |    1 +
 src/p8est_search.c                                 |    1 +
 src/p8est_search.h                                 |  107 +-
 src/p8est_tets_hexes.c                             |    1 +
 src/p8est_tets_hexes.h                             |    1 +
 src/p8est_vtk.c                                    |    1 +
 src/p8est_vtk.h                                    |  310 ++--
 src/p8est_wrap.c                                   |    4 +-
 src/p8est_wrap.h                                   |  206 ++-
 test/Makefile.am                                   |   76 +-
 example/p6est/test/test_all.c => test/test_all6.c  |   12 +-
 test/test_balance2.c                               |    1 +
 test/test_balance3.c                               |    1 +
 test/test_balance_seeds2.c                         |    3 +-
 test/test_balance_seeds3.c                         |    5 +-
 test/test_balance_type2.c                          |    1 +
 test/test_balance_type3.c                          |    1 +
 test/test_brick2.c                                 |    1 +
 test/test_brick3.c                                 |    1 +
 test/test_coarsen2.c                               |    1 +
 test/test_coarsen3.c                               |    1 +
 test/test_comm.c                                   |    1 +
 test/test_complete_subtree.c                       |  165 ++
 test/test_conn_complete2.c                         |    1 +
 test/test_conn_complete3.c                         |    1 +
 test/test_conn_reduce2.c                           |    1 +
 test/test_conn_reduce3.c                           |    1 +
 test/test_conn_transformation2.c                   |  218 +++
 .../bricks3.c => test/test_conn_transformation3.c  |    3 +-
 test/{test_join2.c => test_connrefine2.c}          |   29 +-
 .../timings/bricks3.c => test/test_connrefine3.c   |    3 +-
 test/test_edge_face_corners3.c                     |    1 +
 test/test_face_transform3.c                        |    1 +
 test/test_ghost2.c                                 |   68 +
 test/test_ghost3.c                                 |    1 +
 test/test_hash.c                                   |    1 +
 test/test_iterate2.c                               |    3 +-
 test/test_iterate3.c                               |    3 +-
 test/test_join2.c                                  |   14 +-
 test/test_join3.c                                  |    3 +-
 test/test_lnodes2.c                                |    3 +-
 test/test_lnodes3.c                                |    3 +-
 test/test_load2.c                                  |  109 ++
 example/timings/bricks3.c => test/test_load3.c     |    3 +-
 test/test_loadsave2.c                              |    1 +
 test/test_loadsave3.c                              |    1 +
 test/test_order.c                                  |    1 +
 test/test_partition2.c                             |   17 +-
 test/test_partition3.c                             |    1 +
 test/test_partition_corr2.c                        |    1 +
 test/test_partition_corr3.c                        |    1 +
 test/test_periodic3.c                              |   49 +-
 test/test_plex2.c                                  |  131 +-
 test/test_plex3.c                                  |    3 +-
 test/test_quadrants2.c                             |    1 +
 test/test_quadrants3.c                             |    1 +
 test/test_reorder2.c                               |    3 +-
 test/test_reorder3.c                               |    3 +-
 test/test_replace2.c                               |    3 +-
 test/test_replace3.c                               |    3 +-
 test/test_search2.c                                |   31 +-
 test/test_search3.c                                |    1 +
 test/test_subcomm2.c                               |  291 ++++
 example/timings/bricks3.c => test/test_subcomm3.c  |    3 +-
 test/test_valid2.c                                 |   29 +-
 test/test_valid3.c                                 |    1 +
 test/test_wrap2.c                                  |   65 +-
 test/test_wrap3.c                                  |    4 +-
 314 files changed, 14550 insertions(+), 2524 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 7375c24..a4b8480 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,4 +1,3 @@
 [submodule "sc"]
 	path = sc
 	url = https://github.com/cburstedde/libsc.git
-	# url = http://burstedde.ins.uni-bonn.de/repos/libsc.git
diff --git a/AUTHORS b/AUTHORS
index 5296975..986d531 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -37,6 +38,7 @@
   Ethan Alan Hereth <e.a.hereth at gmail.com>
   Alex Fikl <alexfikl at gmail.com>
   Hannes Frank <frank at iag.uni-stuttgart.de>
+  Michael Lahnert <michael.lahnert at ipvs.uni-stuttgart.de>
 */
 
 /*
@@ -44,6 +46,8 @@
 
   Matthias Maier <matthias.maier at iwr.uni-heidelberg.de>
   Fabian Hempert <fabian.hempert at de.bosch.com>
+  Denis Davydov <davydden at gmail.com>
+  Wolfgang Bangerth <bangerth at math.tamu.edu>
 */
 
 /*
@@ -66,6 +70,8 @@
   The subdirectory sc contains the SC Library licensed under GNU LGPL.
   See the files sc/AUTHORS and sc/COPYING for copyright information.
 
+  For information on individual contributions see doc/author_*.txt.
+
   Several scripts under config are released under GNU GPL versions.
   These are not compiled or linked with p4est, so we consider them
   independent programs that are not part of p4est.
diff --git a/Doxyfile.in b/Doxyfile.in
index 183714e..7abfcc9 100644
--- a/Doxyfile.in
+++ b/Doxyfile.in
@@ -371,7 +371,7 @@ INLINE_SIMPLE_STRUCTS  = NO
 # types are typedef'ed and only the typedef is referenced, never the tag name.
 # The default value is: NO.
 
-TYPEDEF_HIDES_STRUCT   = YES
+TYPEDEF_HIDES_STRUCT   = NO
 
 # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
 # cache is used to resolve symbols given their name and scope. Since this can be
@@ -997,7 +997,9 @@ COLS_IN_ALPHA_INDEX    = 3
 # while generating the index headers.
 # This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
 
-IGNORE_PREFIX          = p4est_ \
+IGNORE_PREFIX          = p2est_ \
+                         p4est_ \
+                         p6est_ \
                          p8est_
 
 #---------------------------------------------------------------------------
diff --git a/Makefile.am b/Makefile.am
index 3556c7a..2bdad33 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -108,7 +108,7 @@ if P4EST_DIST_DENY
 	@exit 1
 endif
 
-clean-local: clean-local-p6est
+clean-local:
 	rm -f ChangeLog *vtu *.visit *.p4c *.p4p *.p8c *.p8p
 
 maintainer-clean-local:
diff --git a/README b/README
index 2634b48..c0ea623 100644
--- a/README
+++ b/README
@@ -4,20 +4,23 @@ This is the README file for p4est.
 p4est is a C library to manage a collection (a forest) of multiple
 connected adaptive quadtrees or octrees in parallel.
 
+Copyright (C) 2010 The University of Texas System
+Additional copyright (C) 2011 individual authors
+
 p4est is written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
-and released under the GNU General Public Licence version 2 (or later),
-Copyright (C) 2010 The University of Texas System.
+and released under the GNU General Public Licence version 2 (or later).
 
 The official web page for source code and documentation is www.p4est.org.
-Please send bug reports and ideas for contribution to p4est at librelist.com.
+Please send bug reports and ideas for contribution to info at p4est.org.
 
 
  0. Acknowledgement and Disclaimer
 
 The development of p4est was partially supported by the US National Science
-Foundation (NSF Grants No. OCI-0749334, CCF-0427985, CNS-0540372, CNS-0619838,
-DMS-0724746, OPP-0941678) and the US Department of Energy (DOE Grants No.
-06ER25782, 08ER25860, SC0002710).  The authors thank the Texas Advanced
+Foundation (NSF Grants No. CCF-0427985, CMMI-1028889, CNS-0540372, CNS-0619838,
+DMS-0724746, OCI-0749334, OPP-0941678) and the US Department of Energy (DOE
+Grants No. 06ER25782, 08ER25860, SC0002710).
+The authors thank the Texas Advanced
 Computing Center (TACC) for providing them with access to the Ranger
 supercomputer under NSF TeraGrid award MCA04N026, and the National Center for
 Computational Science (NCCS) for early-user access to the Jaguar Cray XT5
@@ -88,15 +91,16 @@ applications.  By querying the ghost layer, the application can associate
 degrees of freedom with the mesh which are the basis for all numerical
 computation.
 
-p?est_nodes: Create a globally unique numbering of the mesh nodes (i.e., the
-vertices at the corners of octants, not to be confused with octree nodes),
-taking into account the classification into ``independent'' and ``hanging''
-nodes.  This function requires previous calls to p?est_balance and p?est_ghost.
-It provides a global numbering for degrees of freedom that can be associated
-with piecewise linear finite elements.
+p?est_lnodes: Create a globally unique numbering of finite element degrees of
+freedom for arbitrary order C0 nodal polynomials, also known as GLL basis.  The
+function takes into account the classification into ``independent'' and
+``hanging'' degrees of freedom.  Numberings for standard piecewise d-linear
+finite elements or lowest-order Raviart-Thomas velocity variables are created
+by calling this function with the parameters degree = 1 and -1, respectively.
+This function requires previous calls to p?est_balance and p?est_ghost.
 
-p?est_lnodes: Like p?est_nodes, but for arbitrary order Lagrangian basis
-functions.  Supersedes p?est_nodes for the piecewise linear special case.
+p?est_nodes: Like p?est_lnodes, but only for piecewise linear finite elements.
+Technically no longer required, but we keep it around for its simplicity.
 
 
  5. Installation from a release tarball
@@ -124,6 +128,13 @@ CFLAGS="-Wall -g -O0" is an example of setting flags for a development build
 compiled with the GNU compilers.  p4est is written in plain C and does not
 require FFLAGS or CXXFLAGS.
 
+So, with a gcc-style compiler, a good debug configure line would be
+  /path/to/configure CFLAGS="-O0 -g -Wall -Wuninitialized" \
+                     --enable-debug --enable-mpi --disable-shared
+and for a production build use for example
+  /path/to/configure CFLAGS="-O2 -Wall -Wno-unused-but-set-variable" \
+                     --enable-mpi
+
 Both in-source and out-of-source builds are supported.  For a one-time
 installation, ./configure can be called in the root directory of the unpacked
 tree.  For development or for testing multiple configure options, create an
@@ -158,8 +169,10 @@ If you have obtained p4est from a git repository, such as via the shell command
    git clone https://github.com/cburstedde/p4est.git
 then the libsc submodule, which resides in the subdirectory sc, must be
 downloaded before configuring and compiling.  This can be accomplished with the
-shell command 'git submodule init; git submodule update'.  After the submodule
-has been obtained, you can install from source as described above in 6.
+shell commands
+   git submodule init; git submodule update
+After the submodule has been obtained, install from source as described above
+in 6.
 
 
  8. Using p4est through the deal.II interface
diff --git a/bugs/issue-c27b6579bbf507787146bd88e1b2c4f998a9ea07.yaml b/bugs/issue-c27b6579bbf507787146bd88e1b2c4f998a9ea07.yaml
new file mode 100644
index 0000000..7ba47c6
--- /dev/null
+++ b/bugs/issue-c27b6579bbf507787146bd88e1b2c4f998a9ea07.yaml
@@ -0,0 +1,42 @@
+--- !ditz.rubyforge.org,2008-03-06/issue 
+title: Check on p4est_complete_subtree
+desc: |-
+  I've added a branch fix-complete-subtree that introduces a program
+  test/p4est_test_complete_subtree.  It crashes for me on 2 processes.
+  Basically I'm calling complete_subtree with an array that is one quadrant
+  and would expect it to fill the range between first_desc and last_desc in
+  the coarsest possible way, including that quadrant.
+  
+  I also have a proposed fix in 0211af53fc429d0deb586c761d4b5e09acc62fdf
+  that I'm not triggering with this version of the test, but it should be
+  possible to reproduce the assertion child_id != 0 without the fix.
+  But then the fix leaves the following question: if I replace a quadrant
+  by its sibling with child id 0, I may be moving it before first_desc,
+  thus creating a quadrant that is invalid for that tree.  Is this issue
+  being considered?
+  
+  It is also possible that the crash in the test is not complete_subtree's
+  fault but goes back to find_higher_bound; at least worth checking.
+  
+  I'm also suggesting to expand the docs on complete_subtree to clarify
+  what legal input data looks like (the p4est need not be valid),
+  especially regarding the level statistics in the tree.
+type: :bugfix
+component: p4est
+release: 
+reporter: Carsten Burstedde <burstedde at ins.uni-bonn.de>
+status: :closed
+disposition: :fixed
+creation_time: 2016-02-03 10:48:13.169545 Z
+references: []
+
+id: c27b6579bbf507787146bd88e1b2c4f998a9ea07
+log_events: 
+- - 2016-02-03 10:48:14.465530 Z
+  - Carsten Burstedde <burstedde at ins.uni-bonn.de>
+  - created
+  - ""
+- - 2016-02-04 15:58:59.711115 Z
+  - Tobin Isaac <tisaac at ices.utexas.edu>
+  - closed with disposition fixed
+  - Carsten was right, the new algorithm had assumptions that were only correct for input arrays that were already complete; there was also some buggy usage of memmove() and the counters were not properly updated when used from completion.
diff --git a/build-aux/git-version-gen b/build-aux/git-version-gen
index b6c9d41..f1a255c 100755
--- a/build-aux/git-version-gen
+++ b/build-aux/git-version-gen
@@ -141,7 +141,7 @@ v=`echo "$v" |sed 's/^v//'`
 # Don't declare a version "dirty" merely because a time stamp has changed.
 git status > /dev/null 2>&1
 
-dirty=`sh -c 'git diff-index --name-only HEAD' 2>/dev/null` || dirty=
+dirty=`sh -c 'git diff-index --name-only HEAD --ignore-submodules=untracked' 2>/dev/null` || dirty=
 case "$dirty" in
   '') ;;
   *) # Append the suffix only if there isn't one already.
diff --git a/config/p4est_petsc.m4 b/config/p4est_petsc.m4
index 6925889..c107616 100644
--- a/config/p4est_petsc.m4
+++ b/config/p4est_petsc.m4
@@ -3,49 +3,117 @@ dnl P4EST_CHECK_PETSC(PREFIX)
 dnl Check for the PETSc library and link a test program
 dnl portions adapted from libmesh petsc.m4
 dnl
+dnl This macro examines the argument to --with-petsc, if any,
+dnl and the environment variables PETSC_DIR and PETSC_ARCH.
+dnl It also tries to find the program petscarch and run it.
+dnl
+dnl If the environment variables exist they are used as defaults.
+dnl The petsc directory can be overridden by the --with-petsc= argument.
+dnl If this fails, we set it to /usr/lib/petsc if the petscarch program exists.
+dnl The architecture is left at its default unless petscarch produces it.
+dnl
 AC_DEFUN([P4EST_CHECK_PETSC], [
 
+dnl we may need this program if the environment variables are insufficient
+AC_PATH_PROG([PETSCARCH_PROG], [petscarch])
 AC_MSG_CHECKING([for PETSc])
 
 SC_ARG_WITH_PREFIX([petsc], [enable PETSc-dependent code], [PETSC], [$1])
 $1_PETSC_INCLUDE_DIRS=
 $1_PETSC_LINK_LIBS=
 if test "x$$1_WITH_PETSC" != xno ; then
-  # use the PETSC_DIR environment variable by default
+  dnl use the PETSC_DIR and PETSC_ARCH environment variables by default
   $1_PETSC_DIR="$PETSC_DIR"
   $1_PETSC_ARCH="$PETSC_ARCH"
+
+  dnl see if we can find a better petsc directory
   if test "x$$1_WITH_PETSC" != xyes ; then
     $1_PETSC_DIR="$$1_WITH_PETSC"
   fi
-  if (test "x$$1_PETSC_DIR" = x); then
-    AC_PATH_PROG(PETSCARCH, petscarch)
-    if (test "x$PETSCARCH" != x); then
+  if test "x$$1_PETSC_DIR" = x; then
+    if test "x$PETSCARCH_PROG" != x; then
       $1_PETSC_DIR=/usr/lib/petsc
     fi
   fi
   if test ! -r $$1_PETSC_DIR/include/petscversion.h ; then
     AC_MSG_ERROR([Unable to find readable petscversion.h])
   fi
-  $1_PETSC_MAJOR=`grep "define PETSC_VERSION_MAJOR" $$1_PETSC_DIR/include/petscversion.h | sed -e "s/#define PETSC_VERSION_MAJOR[ ]*//g"`
+
+  dnl see if we can determine the petsc arch string
+  if test "x$$1_PETSC_ARCH" = x; then
+    if test "x$PETSCARCH_PROG" != x; then
+      $1_PETSC_ARCH=`$PETSCARCH_PROG`
+    fi
+  fi
+
+  dnl try to use the petsc configuration with the current variables
+  $1_PETSC_MAJOR=`grep "define PETSC_VERSION_MAJOR"                    \
+    $$1_PETSC_DIR/include/petscversion.h                               \
+    | sed -e "s/#define PETSC_VERSION_MAJOR[ ]*//g"`
   if test "$$1_PETSC_MAJOR" -lt 3 ; then
     AC_MSG_ERROR([PETSc version >= 3.0 required])
   fi
   if test -r $$1_PETSC_DIR/makefile ; then
-    $1_PETSC_LINK_LIBS=`make -s -C $$1_PETSC_DIR getlinklibs`
-    $1_PETSC_INCLUDE_DIRS=`make -s -C $$1_PETSC_DIR getincludedirs`
+    $1_PETSC_LINK_LIBS=`                                               \
+      export PETSC_DIR="$$1_PETSC_DIR" PETSC_ARCH="$$1_PETSC_ARCH";    \
+      make -s -C $$1_PETSC_DIR getlinklibs`
+    $1_PETSC_INCLUDE_DIRS=`                                            \
+      export PETSC_DIR="$$1_PETSC_DIR" PETSC_ARCH="$$1_PETSC_ARCH";    \
+      make -s -C $$1_PETSC_DIR getincludedirs`
   elif test -r $$1_PETSC_DIR/conf/variables ; then
     if ! test -r $$1_PETSC_DIR/conf/rules ; then
-      AC_MSG_ERROR([Unable to find $$1_PETSC_DIR/makefile or $$1_PETSC_DIR/conf/rules])
+      AC_MSG_ERROR([Unable to find $$1_PETSC_DIR/makefile\
+ or $$1_PETSC_DIR/conf/rules])
     fi
     cat <<EOF >Makefile_config_petsc
 include $$1_PETSC_DIR/conf/variables
 include $$1_PETSC_DIR/conf/rules
 EOF
+    $1_PETSC_LINK_LIBS=`                                               \
+      export PETSC_DIR="$$1_PETSC_DIR" PETSC_ARCH="$$1_PETSC_ARCH";    \
+      make -s -f Makefile_config_petsc getlinklibs`
+    $1_PETSC_INCLUDE_DIRS=`                                            \
+      export PETSC_DIR="$$1_PETSC_DIR" PETSC_ARCH="$$1_PETSC_ARCH";    \
+      make -s -f Makefile_config_petsc getincludedirs`
+    rm -f Makefile_config_petsc
+  elif test -r $$1_PETSC_DIR/lib/petsc-conf/variables ; then
+    if ! test -r $$1_PETSC_DIR/lib/petsc-conf/rules ; then
+      AC_MSG_ERROR([Unable to find $$1_PETSC_DIR/makefile or\
+ $$1_PETSC_DIR/lib/petsc-conf/rules])
+    fi
+    cat <<EOF >Makefile_config_petsc
+include $$1_PETSC_DIR/lib/petsc-conf/variables
+include $$1_PETSC_DIR/lib/petsc-conf/rules
+EOF
+    $1_PETSC_LINK_LIBS=`make -s -f Makefile_config_petsc getlinklibs`
+    $1_PETSC_INCLUDE_DIRS=`make -s -f Makefile_config_petsc getincludedirs`
+    rm -f Makefile_config_petsc
+  elif test -r $$1_PETSC_DIR/lib/petsc/conf/variables ; then
+    if ! test -r $$1_PETSC_DIR/lib/petsc/conf/rules ; then
+      AC_MSG_ERROR([Unable to find $$1_PETSC_DIR/makefile or\
+ $$1_PETSC_DIR/lib/petsc/conf/rules])
+    fi
+    cat <<EOF >Makefile_config_petsc
+include $$1_PETSC_DIR/lib/petsc/conf/variables
+include $$1_PETSC_DIR/lib/petsc/conf/rules
+EOF
+    $1_PETSC_LINK_LIBS=`make -s -f Makefile_config_petsc getlinklibs`
+    $1_PETSC_INCLUDE_DIRS=`make -s -f Makefile_config_petsc getincludedirs`
+    rm -f Makefile_config_petsc
+  elif test -r $$1_PETSC_DIR/lib/petsc-conf/variables ; then
+    if ! test -r $$1_PETSC_DIR/lib/petsc-conf/rules ; then
+      AC_MSG_ERROR([Unable to find $$1_PETSC_DIR/makefile or $$1_PETSC_DIR/lib/petsc-conf/rules])
+    fi
+    cat <<EOF >Makefile_config_petsc
+include $$1_PETSC_DIR/lib/petsc-conf/variables
+include $$1_PETSC_DIR/lib/petsc-conf/rules
+EOF
     $1_PETSC_LINK_LIBS=`make -s -f Makefile_config_petsc getlinklibs`
     $1_PETSC_INCLUDE_DIRS=`make -s -f Makefile_config_petsc getincludedirs`
     rm -f Makefile_config_petsc
   else 
-    AC_MSG_ERROR([Unable to find $$1_PETSC_DIR/makefile or $$1_PETSC_DIR/conf/variables])
+    AC_MSG_ERROR([Unable to find $$1_PETSC_DIR/makefile or $$1_PETSC_DIR/conf/variables\
+ or $$1_PETSC_DIR/lib/petsc-conf/variables or $$1_PETSC_DIR/lib/petsc/conf/variables])
   fi
   PRE_PETSC_CPPFLAGS="$CPPFLAGS"
   CPPFLAGS="$CPPFLAGS $$1_PETSC_INCLUDE_DIRS"
@@ -72,4 +140,3 @@ fi
 AC_SUBST([$1_PETSC_INCLUDE_DIRS])
 AC_SUBST([$1_PETSC_LINK_LIBS])
 ])
-
diff --git a/configure.ac b/configure.ac
index 485edbd..b20811f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4,7 +4,7 @@ dnl
 
 AC_INIT([p4est],
         [m4_esyscmd([build-aux/git-version-gen .tarball-version])],
-        [p4est at librelist.com])
+        [info at p4est.org])
 AC_PREREQ(2.61)
 AC_CONFIG_HEADERS([src/pre_config.h])
 AC_CONFIG_SRCDIR([src/p4est.h])
@@ -30,7 +30,7 @@ P4EST_ARG_DISABLE([vtk-zlib], [disable zlib compression for vtk binary data],
                   [VTK_COMPRESSION])
 P4EST_ARG_DISABLE([2d], [disable the 2D library], [BUILD_2D])
 P4EST_ARG_DISABLE([3d], [disable the 3D library], [BUILD_3D])
-P4EST_ARG_ENABLE([p6est], [enable hybrid 2D+1D p6est library], [BUILD_P6EST])
+P4EST_ARG_DISABLE([p6est], [disable hybrid 2D+1D p6est library], [BUILD_P6EST])
 
 echo "o---------------------------------------"
 echo "| Checking MPI and related programs"
diff --git a/doc/FREEBSD b/doc/FREEBSD
new file mode 100644
index 0000000..9c76845
--- /dev/null
+++ b/doc/FREEBSD
@@ -0,0 +1,23 @@
+Copyright (c) <YEAR>, <OWNER>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/doc/announce/nadv14no37.txt b/doc/announce/nadv14no37.txt
new file mode 100644
index 0000000..bbd3302
--- /dev/null
+++ b/doc/announce/nadv14no37.txt
@@ -0,0 +1,36 @@
+Subject: NA Digest, V. 14, # 37
+
+NA Digest  Thursday, October 23, 2014  Volume 14 : Issue 37
+
+Today's Topics:
+
+	New software release, p4est 1.1
+
+-------------------------------------------------------
+
+From: Carsten Burstedde burstedde at ins.uni-bonn.de
+Date: October 21, 2014
+Subject: New software release, p4est 1.1
+
+We are pleased to announce the release v1.1 of the p4est software.
+p4est is a C library to manage a collection (a forest) of multiple
+connected adaptive quadtrees or octrees in parallel.  p4est offers
+fast absolute-time and low-memory algorithms for a feature-complete
+set of octree-based AMR operations.  p4est has been shown to scale to
+the full size of the Jaguar (2011) and Juqueen (2014) supercomputers
+and is used in several production science applications.  This version
+is considered stable and includes all recent updates.
+
+The software is freely available at www.p4est.org where we also link
+to a howto document, technical papers, and inline source
+documentation.
+
+There is an archived mailing list for discussion of any p4est related
+issues; please see http://librelist.com/browser/p4est/.  Everybody is
+free to subscribe.  We are happy to help and respond to any issues
+that may arise.
+
+Enjoy!  The p4est developers,
+Carsten Burstedde
+Lucas C. Wilcox
+Tobin Isaac
diff --git a/doc/attic/p8est_trilinear.c b/doc/attic/p8est_trilinear.c
index 119a25c..debec7b 100644
--- a/doc/attic/p8est_trilinear.c
+++ b/doc/attic/p8est_trilinear.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/doc/attic/p8est_trilinear.h b/doc/attic/p8est_trilinear.h
index d35af77..d0cd7f7 100644
--- a/doc/attic/p8est_trilinear.h
+++ b/doc/attic/p8est_trilinear.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/doc/attic/p8est_trilinear_lnodes.c b/doc/attic/p8est_trilinear_lnodes.c
index 76355ce..a6b4a7c 100644
--- a/doc/attic/p8est_trilinear_lnodes.c
+++ b/doc/attic/p8est_trilinear_lnodes.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/doc/attic/p8est_trilinear_nodes.c b/doc/attic/p8est_trilinear_nodes.c
index 91706a8..5a9f38b 100644
--- a/doc/attic/p8est_trilinear_nodes.c
+++ b/doc/attic/p8est_trilinear_nodes.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/doc/author_lahnert.txt b/doc/author_lahnert.txt
new file mode 100644
index 0000000..ea4baff
--- /dev/null
+++ b/doc/author_lahnert.txt
@@ -0,0 +1 @@
+I place my contributions to p4est under the FreeBSD license. Michael Lahnert (michael.lahnert at ipvs.uni-stuttgart.de)
diff --git a/doc/mainpage.dox b/doc/mainpage.dox
index fea7c30..f88c88d 100644
--- a/doc/mainpage.dox
+++ b/doc/mainpage.dox
@@ -50,7 +50,7 @@
  * [p4est-howto.pdf](http://p4est.github.io/p4est-howto.pdf),
  * which is distributed with the
  * [source](http://github.com/cburstedde/p4est) under the
- * [doc/](https://github.com/cburstedde/p4est/tree/develop/doc) directory.
+ * [doc/](https://github.com/cburstedde/p4est/tree/master/doc) directory.
  *
  * To build the p4est library from a tar distribution, use the standard
  * procedure of the GNU autotools.  The configure script takes the following
@@ -64,10 +64,11 @@
  *                    in serial only.
  * * `--disable-mpiio`  may be used to avoid using `MPI_File` based calls.
  *
- * A typical development configure line looks as follows:
- * > `relative/path/to/configure CFLAGS="-Wall -O0 -g" --enable-mpi --enable-debug`
+ * A typical development configure line looks as follows (your mileage may vary
+ * wrt. compiler-dependent warning options):
+ * > `relative/path/to/configure CFLAGS="-Wall -Wuninitialized -O0 -g" --enable-mpi --enable-debug`
  * A typical production configure line looks as follows:
- * > `relative/path/to/configure CFLAGS="-Wall -O2" --enable-mpi`
+ * > `relative/path/to/configure CFLAGS="-Wall -Wno-unused-but-set-variable -O2" --enable-mpi`
  *
  * \see http://www.p4est.org/
  * \see http://www.gnu.org/licenses/licenses.html
diff --git a/doc/morton/morton.py b/doc/morton/morton.py
new file mode 100755
index 0000000..b8f3f60
--- /dev/null
+++ b/doc/morton/morton.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python
+
+import os,sys,argparse
+
+class MortonSegmentTable:
+  """ A table to enumerate the types of Morton segments of various lengths """
+
+  def __init__(self,dim,logfile=None):
+    self.dim        = dim
+    self.table      = {}
+    self.calculated = {}
+    self.caseOne    = []
+    self.caseWeak   = [None] * dim
+    self.logfile    = logfile
+    self.typestr    = {-1: "disconnected", 0: "strongly connected"}
+    for d in range(0,dim):
+      self.typestr[d + 1] = "weakly connected in direction %d" % d
+    if not logfile:
+      self.logfile  = open(os.devnull, 'w')
+    return
+
+  def calculate_case_one(self):
+    if self.caseOne:
+      return
+    else:
+      self.logfile.write("===\nCalculate child segments of unitary segment\n")
+      for i in range (1,2**self.dim + 1):
+        if i == 1:
+          self.logfile.write("  %d child segments of length 1\n" % 2**self.dim)
+          self.caseOne.append((1,2**self.dim,0))
+        else:
+          self.logfile.write("  Counting child segments of length %d\n" % i)
+          disconnected = 0
+          connected = 0
+          weak = [0] * self.dim
+          for j in range (0,2**self.dim - i + 1):
+            self.logfile.write("    Considering segment of length %d starting at %d ..." % (i,j))
+            first = j;
+            last  = j + i - 1;
+            diff = last ^ first
+            diffDim = int.bit_length(diff) - 1
+            shifted = first ^ 2**diffDim
+            if shifted < last:
+              self.logfile.write(" strongly connected\n")
+              connected += 1
+            elif shifted == last:
+              self.logfile.write(" weakly connected in direction %d\n" % diffDim)
+              weak[diffDim] += 1
+            else:
+              self.logfile.write(" disconnected\n")
+              disconnected += 1
+          self.logfile.write("  Total disconnected length %d child segments: %d\n" % (i,disconnected))
+          self.caseOne.append((i,disconnected,-1))
+          self.logfile.write("  Total strongly connected length %d child segments: %d\n" % (i,connected))
+          self.caseOne.append((i,connected,0))
+          for d in range(0,self.dim):
+            self.logfile.write("  Total weakly connected length %d child segments in direction %d: %d\n" % (i,d,weak[d]))
+            self.caseOne.append((i,weak[d],d + 1))
+      self.logfile.write("===\n")
+    return
+
+  def calculate_case_weak(self,way):
+    if self.caseWeak[way]:
+      return
+    else:
+      self.caseWeak[way] = []
+      self.logfile.write("===\nCalculate child segments of weakly connected segment in direction %d\n" % way)
+      counters = [0] * (2**(self.dim + 1) - 1) * 3
+      onFace = [0] * 2**(self.dim)
+      for i in range(0,2**(self.dim-1)):
+        ilow = i & (2**way - 1)
+        ihi  = (i^ilow) & (2**self.dim - 1)
+        idx = ilow | (ihi << 1)
+        for j in range(idx,2**self.dim):
+          onFace[j] = i
+      for i in range(0,2**self.dim):
+        self.logfile.write("  If first cell has %d children in child segment ...\n" % (i + 1))
+        for j in range(0,2**self.dim):
+          self.logfile.write("    ... and last cell has %d children in child segment ..." % (j + 1))
+          childLength = i + j + 2
+          iOnFace = onFace[i]
+          jOnFace = 2**(self.dim - 1) - 1 - onFace[j]
+          if iOnFace < jOnFace:
+            self.logfile.write(" no first/last children are neighbors: disconnected\n")
+            counters[(childLength - 2) * 3 + 0] += 1
+          elif iOnFace == jOnFace and (not i & (2**way)) and (not j & (2**way)):
+            self.logfile.write(" only endpoint children are neighbors: weakly connected in direction %d\n" % way)
+            counters[(childLength - 2) * 3 + 2] += 1
+          else:
+            self.logfile.write(" first/last children other than endpoints are neighbors: strongly connected\n")
+            counters[(childLength - 2) * 3 + 1] += 1
+      for i in range(2,2**(self.dim+1) + 1):
+        idx = i - 2
+        self.logfile.write("  Total disconnected length %d child segments: %d\n" % (i,counters[idx * 3 + 0]))
+        self.caseWeak[way].append((i,counters[idx * 3 + 0],-1))
+        self.logfile.write("  Total strongly connected length %d child segments: %d\n" % (i,counters[idx * 3 + 1]))
+        self.caseWeak[way].append((i,counters[idx * 3 + 1],0))
+        self.logfile.write("  Total weakly connected length %d child segments in direction %d: %d\n" % (i,way,counters[idx * 3 + 2]))
+        self.caseWeak[way].append((i,counters[idx * 3 + 2],1+way))
+      self.logfile.write("===\n")
+    return
+
+  def child_tuple_insert(self,level,length,segtype,count,pad):
+    childTuple = (level,length,segtype)
+    if count:
+      self.logfile.write(pad + "Adding %d segments to (level = %d, length = %d, type = %s)\n" % (count,level,length,self.typestr[segtype]))
+    if childTuple in self.table:
+      self.table[childTuple] += count
+    else:
+      self.table[childTuple] = count
+    return
+
+  def enumerate_internal(self,level,length,segtype,pad):
+
+    assert length > 0
+    assert length <= 2**(self.dim*level)
+
+    nextpad = pad + ' '
+
+    if (level,length,segtype) in self.calculated:
+      return self.table[(level,length,segtype)]
+    else:
+      self.logfile.write(pad + 'Calculating segments on level %d with length %d of type %s ...\n' % (level,length,self.typestr[segtype]))
+      if (level,length,segtype) not in self.table:
+        self.table[(level,length,segtype)] = 0
+      if level == 0:
+        if segtype == 0:
+          self.table[(level,length,segtype)] = 1
+      else:
+        # figure out which ancestor segments could add to me
+        minNumParents = ((length - 1) / 2**(self.dim)) + 1
+        remainder = length % 2**(self.dim)
+        if remainder <= 1:
+          maxNumParents = length/2**(self.dim) + 1
+        else:
+          maxNumParents = length/2**(self.dim) + 2
+        maxNumParents = min(maxNumParents,2**(self.dim*(level - 1)))
+        for numParents in range(minNumParents,maxNumParents+1):
+          self.enumerate_internal(level-1,numParents,-1,nextpad)
+          self.enumerate_internal(level-1,numParents, 0,nextpad)
+          for d in range(0,self.dim):
+            self.enumerate_internal(level-1,numParents,1+d,nextpad)
+
+      self.calculated[(level,length,segtype)] = True
+      count = self.table[(level,length,segtype)]
+      self.logfile.write(pad + '... counted %d segments on level %d with length %d of type %s\n' % (count,level,length,self.typestr[segtype]))
+      if count:
+        if length == 1:
+          self.logfile.write(pad + 'Adding to descendant counts by multiplying unitary segment counts\n')
+          self.calculate_case_one()
+          for (childLength,childCount,childType) in self.caseOne:
+            childAdd = count * childCount;
+            self.child_tuple_insert(level + 1,childLength,childType,childAdd,nextpad)
+        else:
+          if segtype == -1:
+            self.logfile.write(pad + 'Adding to disconnected descendant counts\n')
+            for i in range(2,2**(self.dim + 1) + 1):
+              childLength = (length - 2) * 2**(self.dim) + i
+              childAdd    = count * min(i - 1,2**(self.dim + 1) + 1 - i)
+              self.child_tuple_insert(level + 1,childLength,-1,childAdd,nextpad)
+          elif segtype == 0:
+            self.logfile.write(pad + 'Adding to strongly connected descendant counts\n')
+            for i in range(2,2**(self.dim + 1) + 1):
+              childLength = (length - 2) * 2**(self.dim) + i
+              childAdd    = count * min(i - 1,2**(self.dim + 1) + 1 - i)
+              self.child_tuple_insert(level + 1,childLength,0,childAdd,nextpad)
+          else:
+            self.logfile.write(pad + 'Adding to descendant counts by multiplying %s counts\n' % self.typestr[segtype])
+            self.calculate_case_weak(segtype - 1)
+            for (childLength,childCount,childType) in self.caseWeak[segtype - 1]:
+              childAdd = count * childCount
+              self.child_tuple_insert(level + 1,childLength + 2**(self.dim) * (length - 2),childType,childAdd,nextpad)
+      return count
+
+  def enumerate(self,level,length):
+
+    self.logfile.write("Calculating counts of all types for length %d segments on level %d\n" % (length, level))
+    count = [0] * (self.dim + 2)
+    for segtype in range(-1,self.dim+1):
+      self.logfile.write("***\nEntering recursion to count %s segments\n" % self.typestr[segtype])
+      count[segtype + 1] = self.enumerate_internal(level,length,segtype,' ')
+      self.logfile.write("***\nExiting recursion to count %s segments, counted %d\n" % (self.typestr[segtype],count[segtype+1]))
+    assert sum(count) == 2**(self.dim*level) + 1 - length
+    return count
+
+def pairs(s):
+  try:
+    level, length = map(int, s.split(','))
+    if length <= 0:
+      raise argparse.ArgumentTypeError("length %s not allowed: must be positive" % length)
+    return (level,length)
+  except argparse.ArgumentTypeError:
+    raise
+  except:
+    raise argparse.ArgumentTypeError("segments must be level,length")
+
+if __name__ == '__main__':
+
+  parser = argparse.ArgumentParser(description='Enumerate continuous and discontinuous Morton curve segments')
+  parser.add_argument('--dimension', '-d', help='spatial dimension', type=int, default=2)
+  parser.add_argument('--segments', '-s', help='list of (level, length) segments to calculate',type=pairs,nargs='*',default=[(0,1)])
+  parser.add_argument('--verbose', '-v', help='verbose output (optional filename)',nargs='?', type=argparse.FileType('w'), default=os.devnull)
+  parser.add_argument('--pgfplots', '-p', help='pgfplots-style output (incompatible with --verbose)',action='store_true')
+  parser.add_argument('--random', '-r', help='add (level, N): add N randomly chosen segment lengths on level',type=pairs,nargs='*',default=[])
+  args = parser.parse_args()
+
+  for pair in args.random:
+    import random
+
+    level = pair[0]
+    N = pair[1]
+    total = 2**(args.dimension*level)
+    for i in range(0,N):
+      j = int(2**(random.random()*args.dimension*level))
+      while (level,j) in args.segments:
+        j = int(2**(random.random()*args.dimension*level))
+      args.segments.append((level,j))
+
+
+  if not args.verbose:
+    args.verbose = sys.stdout
+
+  table = MortonSegmentTable(args.dimension,args.verbose)
+
+  if args.pgfplots:
+    print "dimension level length continuous discontinuous contfrac"
+    print "# " + " ".join(sys.argv)
+  
+  for (level,length) in args.segments:
+    counts = table.enumerate(level,length)
+    if args.pgfplots:
+      logstring = "%d %d %d %d %d %f" % (table.dim, level, length, sum(counts[1:]), counts[0],float(sum(counts[1:]))/float(sum(counts)))
+    else:
+      logstring = "%d-D Morton curve with refinement level %d has %d segments of length %d:\n  %d continuous segments and %d discontinuous segments\n  (%f%%,%f%%)" % (table.dim, level, sum(counts), length, sum(counts[1:]), counts[0],100.*float(sum(counts[1:]))/float(sum(counts)),100.*float(counts[0])/float(sum(counts)))
+    print logstring
+    if table.logfile is not sys.stdout:
+      table.logfile.write("%s\n" % logstring)
diff --git a/doc/octreemap.pdf b/doc/octreemap.pdf
new file mode 100644
index 0000000..89d8663
Binary files /dev/null and b/doc/octreemap.pdf differ
diff --git a/doc/p4est-howto.pdf b/doc/p4est-howto.pdf
new file mode 100644
index 0000000..fced282
Binary files /dev/null and b/doc/p4est-howto.pdf differ
diff --git a/doc/p4est_license_approval.pdf b/doc/p4est_license_approval.pdf
new file mode 100644
index 0000000..6bd0875
Binary files /dev/null and b/doc/p4est_license_approval.pdf differ
diff --git a/doc/webpage/index.html b/doc/webpage/index.html
index f48b8fe..7e04c59 100644
--- a/doc/webpage/index.html
+++ b/doc/webpage/index.html
@@ -4,6 +4,10 @@
 on Forests of Octrees</title>
 <meta name="author" content="Carsten Burstedde">
 <link type="text/css" rel="stylesheet" href="p4est.css">
+<!-- mathjax !-->
+<script type="text/javascript"
+  src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
+</script>
 </head>
 <body>
 
@@ -23,22 +27,27 @@ later version.
 <p class="text">
 Please see the <a href="http://github.com/cburstedde/p4est">github repository</a>
 of <tt>p4est</tt> or download the
-<a href="http://p4est.github.io/release/p4est-1.0.tar.gz">latest release tarball</a>.
+<a href="http://p4est.github.io/release/p4est-1.1.tar.gz">latest release tarball</a>.
 The source comes with commented example programs and test cases.
-You can also download the old stable
-<a href="http://p4est.github.io/release/p4est-0.3.4.2.tar.gz">version 0.3.4.2</a>
-that is known to work with <a href="http://www.dealii.org/">deal.II</a>,
-or have a look at
-<a href="https://github.com/p4est/p4est.github.io/tree/master/release/">other
-official releases</a>.
+You can also download
+<a href="https://github.com/p4est/p4est.github.io/tree/master/release/">older
+stable releases</a>.
+Please note that the so-called releases auto-generated by github do
+<em>not</em> work (they are lacking the subdirectory sc and
+some generated files).
 </p>
-<!--
+</div>
+
+ 
+
+<div class="citation">
+<h3>Binary packages</h3>
 <p class="text">
-This is the <a href="http://github.com/cburstedde/libsc">github repository</a>
-for the <tt>sc</tt> auxiliary library.  A tarball is not necessary since <tt>sc</tt>
-is included the <tt>p4est</tt> tarball.
+Contributed packages of <tt>p4est</tt> are available for <a
+href="http://www.gentoo.org/">Gentoo Linux</a> (these are also available on the
+<a href="http://www.dealii.org/download.html">deal.ii download page</a>) and
+the <a href="http://brew.sh/">Homebrew distribution</a>.
 </p>
-//-->
 </div>
 
  
@@ -58,7 +67,7 @@ You can recreate it with <tt>make doxygen</tt> after calling <tt>configure</tt>.
 <p class="text">
 This as a <a href="http://p4est.github.io/p4est-howto.pdf">howto document</a> that documents the basic
 interface design of <tt>p4est</tt> and comments on the
-<a href="http://github.com/cburstedde/p4est/tree/develop/example/steps/">
+<a href="http://github.com/cburstedde/p4est/tree/master/example/steps/">
 step-by-step examples</a> included in the source code.
 </p>
 </div>
@@ -68,18 +77,22 @@ step-by-step examples</a> included in the source code.
 <div class="citation">
 <h3>Questions / Get involved</h3>
 <p class="text">
-We appreciate comments, questions, issues reported, or suggestions for adding features.
-Please email us at <a href="mailto:p4est at librelist.com">p4est at librelist.com</a>.
-This mailing list is <a href="http://librelist.com/browser/p4est/">archived</a>;
-see <a href="http://librelist.com/">librelist.com</a> for details.
-Your first message will auto-subscribe you and provide instructions to proceed.
-<!-- (it may be necessary to resend your initial mail). -->
+We appreciate comments, bug reports, and suggestions for adding features.
+To this end, we recommend using the
+<a href="https://github.com/cburstedde/p4est/issues">issue tracker</a>.
+We will also consider
+<a href="https://github.com/cburstedde/p4est/pulls">pull requests</a>.
+For further questions, please email us at <a href="mailto:info at p4est.org">info at p4est.org</a>.
 </p>
 
  
 
 <p class="text">
-We will also consider pull requests posted on <a href="http://www.github.com/">github</a>.
+We had previously used a now-defunct mailing list that is
+<a href="http://librelist.com/browser/p4est/">archived</a>.
+<!-- see <a href="http://librelist.com/">librelist.com</a> for details. -->
+<!-- Your first message will auto-subscribe you and provide instructions to proceed. -->
+<!-- (it may be necessary to resend your initial mail). -->
 </p>
 </div>
 
@@ -102,9 +115,10 @@ Recently, the 2:1 balance algorithm has been rewritten; see
 
 <p>
 If you use <tt>p4est</tt> for your publications, please cite it as follows [1a].
-The reference [1b] is for people specifically using the topology iterator and/or the
-high-order node numbering.  [1c] is for people interested in the 2:1 balance
-details or the strong scaling limit.
+The reference [1b] is for people specifically using the topology iterator, the
+high-order node numbering, or the top-down search.
+[1c] is for people interested in the 2:1 balance details, the strong scaling
+limit and/or memory footprint.
 </p>
 <p class="cite">
 [1a]
@@ -129,15 +143,33 @@ pages 1103-1133
   doi = {10.1137/100791634}
 }
 </pre>
+
  
+
 <p class="cite">
 [1b]
 Tobin Isaac, Carsten Burstedde, Lucas C. Wilcox, and Omar Ghattas,<br>
-<em>Recursive algorithms for distributed forests of octrees.</em>
-Submitted
-(<a href="http://arxiv.org/abs/1406.0089">arXiv:1406.0089</a>), 2014.
+<em>Recursive algorithms for distributed forests of octrees.</em><br>
+Published in SIAM Journal on Scientific Computing 37 no. 5 (2015),
+pages C497-C531
+(<a href="http://p4est.github.io/papers/IsaacBursteddeWilcoxEtAl15.pdf">download</a>).
 </p>
+
+<pre class="bibtex">
+ at ARTICLE{IsaacBursteddeWilcoxEtAl15,
+  author = {Tobin Isaac and Carsten Burstedde and Lucas C. Wilcox and Omar Ghattas},
+  title = {Recursive algorithms for distributed forests of octrees},
+  journal = {SIAM Journal on Scientific Computing},
+  volume = {37},
+  number = {5},
+  pages = {C497--C531},
+  year = {2015},
+  doi = {10.1137/140970963}
+}
+</pre>
+
  
+
 <p class="cite">
 [1c]
 Tobin Isaac, Carsten Burstedde, and Omar Ghattas,<br>
@@ -147,8 +179,10 @@ Distributed Processing Symposium, 2012
 (<a href="http://p4est.github.io/papers/IsaacBursteddeGhattas12.pdf">download</a>).
 <br>
 <em>Errata:</em>
-In Algorithm 7, line 3 reads "for all o in R;" it should read "for all o in R
-and R^new."
+In Algorithm 7, line 3 reads
+\(\text{for all}\ o\in R\ \text{do}\);
+it should read
+\(\text{for all}\ o\in R\cup R^{\text{new}}\ \text{do}\).
 </p>
 </div>
 
@@ -237,20 +271,17 @@ The <tt>p4est</tt> authors:<br>
 <a href="http://burstedde.ins.uni-bonn.de/">Carsten Burstedde</a><br>
 <a href="http://lucaswilcox.com/">Lucas C. Wilcox</a><br>
 <a href="http://users.ices.utexas.edu/~tisaac/">Tobin Isaac</a><br>
-<!--
-Johann Rudi<br>
-Ethan Alan Hereth<br>
-Johannes Holke<br>
-Alex Fikl<br>
-Hannes Frank<br>
-//-->
+Thanks to our contributors!  Please see the
+<a href="http://github.com/cburstedde/p4est/tree/master/AUTHORS">AUTHORS</a>
+file for details.
 </p>
 
 <div class="citation">
 The development of <tt>p4est</tt> was partially supported by the US National
-Science Foundation (NSF Grants No. OCI-0749334, CCF-0427985, CNS-0540372,
-CNS-0619838, DMS-0724746, OPP-0941678) and the US Department of Energy (DOE
-Grants No. 06ER25782, 08ER25860, SC0002710).  The authors thank the Texas
+Science Foundation (NSF Grants No. CCF-0427985, CMMI-1028889, CNS-0540372,
+CNS-0619838, DMS-0724746, OCI-0749334, OPP-0941678) and the US Department of
+Energy (DOE Grants No. 06ER25782, 08ER25860, SC0002710).
+The authors thank the Texas
 Advanced Computing Center (TACC) for providing them with access to the Ranger
 supercomputer under NSF TeraGrid award MCA04N026, and the National Center for
 Computational Science (NCCS) for early-user access to the Jaguar Cray XT5
diff --git a/example/balance_seeds/balance_seeds2.c b/example/balance_seeds/balance_seeds2.c
index 7489c7f..2a6d898 100644
--- a/example/balance_seeds/balance_seeds2.c
+++ b/example/balance_seeds/balance_seeds2.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2011 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -128,9 +129,9 @@ main (int argc, char **argv)
   p4est_quadrant_t   *q;
   int                 i;
 #ifndef P4_TO_P8
-  const char          filename[] = "p4est_balance_face";
+  char                filename[] = "p4est_balance_face";
 #else
-  const char          filename[] = "p8est_balance_edge";
+  char                filename[] = "p8est_balance_edge";
 #endif
 
   /* initialize MPI */
@@ -156,8 +157,11 @@ main (int argc, char **argv)
 
   p4est_refine (p4est, 1, refine_fn, init_fn);
 
-  p4est_vtk_write_header (p4est, NULL, 1. - 2. * SC_EPS,
-                          0, 0, 0, 0, "level", NULL, filename);
+  p4est_vtk_context_t *context = p4est_vtk_context_new (p4est, filename);
+  p4est_vtk_context_set_scale (context, 1. - 2. * SC_EPS);
+  context = p4est_vtk_write_header (context);
+  SC_CHECK_ABORT (context != NULL, P4EST_STRING "_vtk: Error writing header");
+
   vtkvec = sc_dmatrix_new (p4est->local_num_quadrants, P4EST_CHILDREN);
   tree = p4est_tree_array_index (p4est->trees, 0);
   quadrants = &(tree->quadrants);
@@ -169,8 +173,17 @@ main (int argc, char **argv)
         ((balance_seeds_elem_t *) (q->p.user_data))->flag;
     }
   }
-  p4est_vtk_write_point_scalar (p4est, NULL, filename, "level", vtkvec->e[0]);
-  p4est_vtk_write_footer (p4est, filename);
+  sc_array_t         *level =
+    sc_array_new_data ((void *) vtkvec->e[0], sizeof (double),
+                       count * P4EST_CHILDREN);
+  context =
+    p4est_vtk_write_point_dataf (context, 1, 0, "level", level, context);
+  SC_CHECK_ABORT (context != NULL,
+                  P4EST_STRING "_vtk: Error writing point data");
+  sc_array_destroy (level);
+
+  const int           retval = p4est_vtk_write_footer (context);
+  SC_CHECK_ABORT (!retval, P4EST_STRING "_vtk: Error writing footer");
 
   sc_dmatrix_destroy (vtkvec);
   p4est_destroy (p4est);
diff --git a/example/balance_seeds/balance_seeds3.c b/example/balance_seeds/balance_seeds3.c
index 948d1d8..9acafe6 100644
--- a/example/balance_seeds/balance_seeds3.c
+++ b/example/balance_seeds/balance_seeds3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/mesh/mesh2.c b/example/mesh/mesh2.c
index f678762..830cad8 100644
--- a/example/mesh/mesh2.c
+++ b/example/mesh/mesh2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -53,7 +54,9 @@ typedef enum
   P4EST_CONFIG_MOEBIUS,
   P4EST_CONFIG_STAR,
   P4EST_CONFIG_PERIODIC,
-  P4EST_CONFIG_ROTWRAP
+  P4EST_CONFIG_ROTWRAP,
+  P4EST_CONFIG_CUBED,
+  P4EST_CONFIG_DISK
 #else
   P8EST_CONFIG_UNIT,
   P8EST_CONFIG_PERIODIC,
@@ -127,6 +130,68 @@ refine_normal (p4est_t * p4est, p4est_topidx_t which_tree,
   return 1;
 }
 
+#if 0
+
+static void
+hack_test (mpi_context_t * mpi, p4est_connectivity_t * connectivity)
+{
+  int                 i;
+  int8_t              cc;
+  p4est_topidx_t      tt;
+  p4est_locidx_t      lnq, lng, lnc, lnco;
+  p4est_locidx_t      li, qtc;
+  p4est_locidx_t      co0, co1, coi, cq;
+  p4est_t            *p4est;
+  p4est_ghost_t      *ghost;
+  p4est_mesh_t       *mesh;
+
+  p4est = p4est_new_ext (mpi->mpicomm, connectivity, 0,
+                         refine_level, 1, 0, NULL, NULL);
+  p4est_vtk_write_file (p4est, NULL, "mesh_hack");
+
+  ghost = p4est_ghost_new (p4est, P4EST_CONNECT_FULL);
+  mesh = p4est_mesh_new_ext (p4est, ghost, 1, 1, P4EST_CONNECT_FULL);
+
+  lnq = mesh->local_num_quadrants;
+  lng = mesh->ghost_num_quadrants;
+  lnco = lnq + lng;
+  lnc = mesh->local_num_corners;
+  P4EST_LDEBUGF ("Local quads %lld corners %lld array %lld\n",
+                 (long long) lnq, (long long) lnc,
+                 (long long) mesh->corner_offset->elem_count);
+  for (li = 0; li < lnq; ++li) {
+    tt = mesh->quad_to_tree[li];
+    if (tt >= 2) {
+      /* break; */
+    }
+    for (i = 0; i < P4EST_CHILDREN; ++i) {
+      qtc = mesh->quad_to_corner[P4EST_CHILDREN * li + i];
+      if (qtc >= lnco) {
+        P4EST_LDEBUGF ("Quad %lld tree %lld Corner %d is %lld\n",
+                       (long long) li, (long long) tt, i, (long long) qtc);
+        if (qtc >= lnco) {
+          qtc -= lnco;
+          co0 = *(p4est_locidx_t *) sc_array_index (mesh->corner_offset, qtc);
+          co1 =
+            *(p4est_locidx_t *) sc_array_index (mesh->corner_offset, qtc + 1);
+          for (coi = co0; coi < co1; ++coi) {
+            cq = *(p4est_locidx_t *) sc_array_index (mesh->corner_quad, coi);
+            cc = *(int8_t *) sc_array_index (mesh->corner_corner, coi);
+            P4EST_LDEBUGF ("   Part %d quad %lld corner %d\n",
+                           (int) (coi - co0), (long long) cq, (int) cc);
+          }
+        }
+      }
+    }
+  }
+
+  p4est_mesh_destroy (mesh);
+  p4est_ghost_destroy (ghost);
+  p4est_destroy (p4est);
+}
+
+#endif
+
 static void
 test_mesh (p4est_t * p4est, p4est_ghost_t * ghost, p4est_mesh_t * mesh,
            int compute_tree_index, int compute_level_lists,
@@ -342,7 +407,7 @@ main (int argc, char **argv)
   usage =
     "Arguments: <configuration> <level>\n   Configuration can be any of\n"
 #ifndef P4_TO_P8
-    "      unit|three|moebius|star|periodic|rotwrap\n"
+    "      unit|three|moebius|star|periodic|rotwrap|cubed|disk\n"
 #else
     "      unit|periodic|rotwrap|twocubes|twowrap|rotcubes|shell|sphere\n"
 #endif
@@ -376,6 +441,12 @@ main (int argc, char **argv)
     else if (!strcmp (argv[1], "rotwrap")) {
       config = P4EST_CONFIG_ROTWRAP;
     }
+    else if (!strcmp (argv[1], "cubed")) {
+      config = P4EST_CONFIG_CUBED;
+    }
+    else if (!strcmp (argv[1], "disk")) {
+      config = P4EST_CONFIG_DISK;
+    }
 #else
     else if (!strcmp (argv[1], "periodic")) {
       config = P8EST_CONFIG_PERIODIC;
@@ -430,6 +501,12 @@ main (int argc, char **argv)
   else if (config == P4EST_CONFIG_ROTWRAP) {
     connectivity = p4est_connectivity_new_rotwrap ();
   }
+  else if (config == P4EST_CONFIG_CUBED) {
+    connectivity = p4est_connectivity_new_cubed ();
+  }
+  else if (config == P4EST_CONFIG_DISK) {
+    connectivity = p4est_connectivity_new_disk ();
+  }
 #else
   else if (config == P8EST_CONFIG_PERIODIC) {
     connectivity = p8est_connectivity_new_periodic ();
@@ -461,11 +538,16 @@ main (int argc, char **argv)
 #endif
   }
 
+#if 0
+  /* hack test */
+  hack_test (mpi, connectivity);
+#else
   /* run mesh tests */
   mesh_run (mpi, connectivity, 1, 0, 1, P4EST_CONNECT_FULL);
   mesh_run (mpi, connectivity, 0, 1, 0, P4EST_CONNECT_FULL);
   mesh_run (mpi, connectivity, 0, 0, 0, P4EST_CONNECT_FACE);
   mesh_run (mpi, connectivity, 1, 1, 1, P4EST_CONNECT_FACE);
+#endif
 
   /* clean up and exit */
   p4est_connectivity_destroy (connectivity);
diff --git a/example/mesh/mesh3.c b/example/mesh/mesh3.c
index fdefa5b..803b983 100644
--- a/example/mesh/mesh3.c
+++ b/example/mesh/mesh3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/mesh/periodicity3.c b/example/mesh/periodicity3.c
index 58e663b..2116cc4 100644
--- a/example/mesh/periodicity3.c
+++ b/example/mesh/periodicity3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/p6est/Makefile.am b/example/p6est/Makefile.am
deleted file mode 100644
index 65cd860..0000000
--- a/example/p6est/Makefile.am
+++ /dev/null
@@ -1,25 +0,0 @@
-
-if P4EST_ENABLE_BUILD_P6EST
-libp4est_installed_headers += \
-        example/p6est/p6est.h example/p6est/p6est_ghost.h example/p6est/p6est_lnodes.h \
-        example/p6est/p6est_profile.h example/p6est/p6est_vtk.h example/p6est/p6est_io.h \
-        example/p6est/p6est_extended.h
-libp4est_compiled_sources += \
-        example/p6est/p6est.c example/p6est/p6est_ghost.c example/p6est/p6est_lnodes.c \
-        example/p6est/p6est_profile.c example/p6est/p6est_vtk.c example/p6est/p6est_io.c
-AM_CPPFLAGS += -I@top_srcdir@/example/p6est
-
-p6est_test_programs = example/p6est/test/p6est_test_all
-
-example_p6est_test_p6est_test_all_SOURCES = example/p6est/test/test_all.c
-TESTS += $(p6est_test_programs)
-check_PROGRAMS += $(p6est_test_programs)
-
-# for now -- make it easier for batch builds
-bin_PROGRAMS += $(p6est_test_programs)
-endif
-
-clean-local-p6est:
-	rm -f *.p6p
-
-.PHONY: clean-local-p6est
diff --git a/example/p6est/p6est_io.c b/example/p6est/p6est_io.c
deleted file mode 100644
index e69de29..0000000
diff --git a/example/p6est/p6est_io.h b/example/p6est/p6est_io.h
deleted file mode 100644
index e69de29..0000000
diff --git a/example/points/points2.c b/example/points/points2.c
index 5d3a4e4..d836a79 100644
--- a/example/points/points2.c
+++ b/example/points/points2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/points/points3.c b/example/points/points3.c
index bb50664..d2402fc 100644
--- a/example/points/points3.c
+++ b/example/points/points3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/simple/simple2.c b/example/simple/simple2.c
index b1132ef..87f59f4 100644
--- a/example/simple/simple2.c
+++ b/example/simple/simple2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/simple/simple3.c b/example/simple/simple3.c
index 66145fc..26bf9ee 100644
--- a/example/simple/simple3.c
+++ b/example/simple/simple3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/steps/p4est_step1.c b/example/steps/p4est_step1.c
index d91b7e4..1f53b6d 100644
--- a/example/steps/p4est_step1.c
+++ b/example/steps/p4est_step1.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/steps/p4est_step2.c b/example/steps/p4est_step2.c
index e376d35..fd0363e 100644
--- a/example/steps/p4est_step2.c
+++ b/example/steps/p4est_step2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/steps/p4est_step3.c b/example/steps/p4est_step3.c
index 20c82ef..0f4aa9c 100644
--- a/example/steps/p4est_step3.c
+++ b/example/steps/p4est_step3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -48,6 +49,9 @@
 #include <p8est_iterate.h>
 #endif
 
+/** We had 1. / 0. here to create a NaN but that is not portable. */
+static const double step3_invalid = -1.;
+
 /* In this example we store data with each quadrant/octant. */
 
 /** Per-quadrant data for this example.
@@ -402,7 +406,7 @@ step3_replace_quads (p4est_t * p4est, p4est_topidx_t which_tree,
     parent_data = (step3_data_t *) incoming[0]->p.user_data;
     parent_data->u = 0.;
     for (j = 0; j < P4EST_DIM; j++) {
-      parent_data->du[j] = (1. / 0.);
+      parent_data->du[j] = step3_invalid;
 
     }
     for (i = 0; i < P4EST_CHILDREN; i++) {
@@ -470,7 +474,7 @@ step3_replace_quads (p4est_t * p4est, p4est_topidx_t which_tree,
 static void
 step3_interpolate_solution (p4est_iter_volume_info_t * info, void *user_data)
 {
-  double             *u_interp = (double *) user_data;  /* we passed the array of values to fill as the user_data in the call to p4est_iterate */
+  sc_array_t         *u_interp = (sc_array_t *) user_data;      /* we passed the array of values to fill as the user_data in the call to p4est_iterate */
   p4est_t            *p4est = info->p4est;
   p4est_quadrant_t   *q = info->quad;
   p4est_topidx_t      which_tree = info->treeid;
@@ -480,6 +484,7 @@ step3_interpolate_solution (p4est_iter_volume_info_t * info, void *user_data)
   double              h;
   p4est_locidx_t      arrayoffset;
   double              this_u;
+  double             *this_u_ptr;
   int                 i, j;
 
   tree = p4est_tree_array_index (p4est->trees, which_tree);
@@ -499,7 +504,8 @@ step3_interpolate_solution (p4est_iter_volume_info_t * info, void *user_data)
        * the +y side, etc. */
       this_u += (h / 2) * data->du[j] * ((i & (1 << j)) ? 1. : -1.);
     }
-    u_interp[arrayoffset + i] = this_u;
+    this_u_ptr = (double *) sc_array_index (u_interp, arrayoffset + i);
+    this_u_ptr[0] = this_u;
   }
 
 }
@@ -513,7 +519,7 @@ static void
 step3_write_solution (p4est_t * p4est, int timestep)
 {
   char                filename[BUFSIZ] = { '\0' };
-  double             *u_interp;
+  sc_array_t         *u_interp;
   p4est_locidx_t      numquads;
 
   snprintf (filename, 17, P4EST_STRING "_step3_%04d", timestep);
@@ -522,7 +528,7 @@ step3_write_solution (p4est_t * p4est, int timestep)
 
   /* create a vector with one value for the corner of every local quadrant
    * (the number of children is always the same as the number of corners) */
-  u_interp = P4EST_ALLOC (double, numquads * P4EST_CHILDREN);
+  u_interp = sc_array_new_size (sizeof (double), numquads * P4EST_CHILDREN);
 
   /* Use the iterator to visit every cell and fill in the solution values.
    * Using the iterator is not absolutely necessary in this case: we could
@@ -531,24 +537,43 @@ step3_write_solution (p4est_t * p4est, int timestep)
    * the usage of p4est_iterate in this example */
   p4est_iterate (p4est, NULL,   /* we don't need any ghost quadrants for this loop */
                  (void *) u_interp,     /* pass in u_interp so that we can fill it */
-                 step3_interpolate_solution,    /* callback function that interpolate from the cell center to the cell corners, defined above */
+                 step3_interpolate_solution,    /* callback function that interpolates from the cell center to the cell corners, defined above */
                  NULL,          /* there is no callback for the faces between quadrants */
 #ifdef P4_TO_P8
                  NULL,          /* there is no callback for the edges between quadrants */
 #endif
                  NULL);         /* there is no callback for the corners between quadrants */
 
-  p4est_vtk_write_all (p4est, NULL,     /* we do not need to transform from the vertex space into physical space, so we do not need a p4est_geometry_t * pointer */
-                       0.99,    /* draw each quadrant at almost full scale */
-                       0,       /* do not write the tree id's of each quadrant (there is only one tree in this example) */
-                       1,       /* do write the refinement level of each quadrant */
-                       1,       /* do write the mpi process id of each quadrant */
-                       0,       /* do not wrap the mpi rank (if this were > 0, the modulus of the rank relative to this number would be written instead of the rank) */
-                       1,       /* write one scalar field: the solution value */
-                       0,       /* write no vector fields */
-                       filename, "solution", u_interp);
-
-  P4EST_FREE (u_interp);
+  /* create VTK output context and set its parameters */
+  p4est_vtk_context_t *context = p4est_vtk_context_new (p4est, filename);
+  p4est_vtk_context_set_scale (context, 0.99);  /* quadrant at almost full scale */
+
+  /* begin writing the output files */
+  context = p4est_vtk_write_header (context);
+  SC_CHECK_ABORT (context != NULL,
+                  P4EST_STRING "_vtk: Error writing vtk header");
+
+  /* do not write the tree id's of each quadrant
+   * (there is only one tree in this example) */
+  context = p4est_vtk_write_cell_dataf (context, 0, 1,  /* do write the refinement level of each quadrant */
+                                        1,      /* do write the mpi process id of each quadrant */
+                                        0,      /* do not wrap the mpi rank (if this were > 0, the modulus of the rank relative to this number would be written instead of the rank) */
+                                        0,      /* there is no custom cell scalar data. */
+                                        0,      /* there is no custom cell vector data. */
+                                        context);       /* mark the end of the variable cell data. */
+  SC_CHECK_ABORT (context != NULL,
+                  P4EST_STRING "_vtk: Error writing cell data");
+
+  /* write one scalar field: the solution value */
+  context = p4est_vtk_write_point_dataf (context, 1, 0, /* write no vector fields */
+                                         "solution", u_interp, context);        /* mark the end of the variable cell data. */
+  SC_CHECK_ABORT (context != NULL,
+                  P4EST_STRING "_vtk: Error writing cell data");
+
+  const int           retval = p4est_vtk_write_footer (context);
+  SC_CHECK_ABORT (!retval, P4EST_STRING "_vtk: Error writing footer");
+
+  sc_array_destroy (u_interp);
 }
 
 /** Approximate the divergence of (vu) on each quadrant
@@ -768,7 +793,7 @@ step3_reset_derivatives (p4est_iter_volume_info_t * info, void *user_data)
   int                 j;
 
   for (j = 0; j < P4EST_DIM; j++) {
-    data->du[j] = (1. / 0.);
+    data->du[j] = step3_invalid;
   }
 }
 
diff --git a/example/steps/p4est_step4.c b/example/steps/p4est_step4.c
index 706cd85..2751b58 100644
--- a/example/steps/p4est_step4.c
+++ b/example/steps/p4est_step4.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -260,11 +261,11 @@ share_sum (p4est_t * p4est, p4est_lnodes_t * lnodes, double *v)
   buffer = p4est_lnodes_share_all (&node_data, lnodes);
 
   for (iq = 0; iq < npeers; ++iq) {
-    lrank = (p4est_lnodes_rank_t *) sc_array_index_int (lnodes->sharers, iq);
     sc_array_t         *recv_data =
       (sc_array_t *) sc_array_index_int (buffer->recv_buffers, iq);
-    P4EST_ASSERT (recv_data->elem_size == node_data.elem_size);
 
+    P4EST_ASSERT (recv_data->elem_size == node_data.elem_size);
+    lrank = (p4est_lnodes_rank_t *) sc_array_index_int (lnodes->sharers, iq);
     if (lrank->rank != p4est->mpirank) {
       const int           nshared = (int) lrank->shared_nodes.elem_count;
       const double       *w = (const double *) recv_data->array;
diff --git a/example/steps/p8est_step1.c b/example/steps/p8est_step1.c
index 43ba347..877084c 100644
--- a/example/steps/p8est_step1.c
+++ b/example/steps/p8est_step1.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/steps/p8est_step2.c b/example/steps/p8est_step2.c
index 20b04f7..f11fb5e 100644
--- a/example/steps/p8est_step2.c
+++ b/example/steps/p8est_step2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/steps/p8est_step3.c b/example/steps/p8est_step3.c
index 3b39e67..090eb7e 100644
--- a/example/steps/p8est_step3.c
+++ b/example/steps/p8est_step3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/steps/p8est_step4.c b/example/steps/p8est_step4.c
index 872603c..ffe8e08 100644
--- a/example/steps/p8est_step4.c
+++ b/example/steps/p8est_step4.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/tetgen/read_conn2.c b/example/tetgen/read_conn2.c
index e6d3108..c92ed26 100644
--- a/example/tetgen/read_conn2.c
+++ b/example/tetgen/read_conn2.c
@@ -3,7 +3,9 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 Carsten Burstedde
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/example/tetgen/read_conn3.c b/example/tetgen/read_conn3.c
index ae456a4..8612007 100644
--- a/example/tetgen/read_conn3.c
+++ b/example/tetgen/read_conn3.c
@@ -3,7 +3,9 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 Carsten Burstedde
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/example/tetgen/read_tetgen.c b/example/tetgen/read_tetgen.c
index c797e76..88483df 100644
--- a/example/tetgen/read_tetgen.c
+++ b/example/tetgen/read_tetgen.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/tetgen/write_conn2.c b/example/tetgen/write_conn2.c
index 63b791d..5f20ae0 100644
--- a/example/tetgen/write_conn2.c
+++ b/example/tetgen/write_conn2.c
@@ -3,7 +3,9 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 Carsten Burstedde
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/example/tetgen/write_conn3.c b/example/tetgen/write_conn3.c
index 62cef6c..a0d5b79 100644
--- a/example/tetgen/write_conn3.c
+++ b/example/tetgen/write_conn3.c
@@ -3,7 +3,9 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 Carsten Burstedde
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/example/timings/bricks2.c b/example/timings/bricks2.c
index 438bdec..9730dca 100644
--- a/example/timings/bricks2.c
+++ b/example/timings/bricks2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/timings/bricks3.c b/example/timings/bricks3.c
index 9e3e1e5..8aed999 100644
--- a/example/timings/bricks3.c
+++ b/example/timings/bricks3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/timings/loadconn2.c b/example/timings/loadconn2.c
index 11be191..e5bcd39 100644
--- a/example/timings/loadconn2.c
+++ b/example/timings/loadconn2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/timings/loadconn3.c b/example/timings/loadconn3.c
index 50e643f..3e5ac97 100644
--- a/example/timings/loadconn3.c
+++ b/example/timings/loadconn3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/timings/timings2.c b/example/timings/timings2.c
index 6814850..5c7187b 100644
--- a/example/timings/timings2.c
+++ b/example/timings/timings2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -260,6 +261,7 @@ main (int argc, char **argv)
   int                 first_argc;
   int                 test_multiple_orders;
   int                 skip_nodes, skip_lnodes;
+  int                 repartition_lnodes;
 
   /* initialize MPI and p4est internals */
   mpiret = sc_MPI_Init (&argc, &argv);
@@ -316,6 +318,9 @@ main (int argc, char **argv)
                          "Also time lnodes for orders 2, 4, and 8");
   sc_options_add_switch (opt, 0, "skip-nodes", &skip_nodes, "Skip nodes");
   sc_options_add_switch (opt, 0, "skip-lnodes", &skip_lnodes, "Skip lnodes");
+  sc_options_add_switch (opt, 0, "repartition-lnodes",
+                         &repartition_lnodes,
+                         "Repartition to load-balance lnodes");
 
   first_argc = sc_options_parse (p4est_package_id, SC_LP_DEFAULT,
                                  opt, argc, argv);
@@ -485,7 +490,7 @@ main (int argc, char **argv)
           break;
         }
         P4EST_GLOBAL_PRODUCTIONF ("mpirun -np %3d %s%s -c %10s -l %2d\n",
-                                  r->mpisize, opt->program_path,
+                                  r->mpisize, P4EST_STRING "_timings",
                                   oldschool ? " --oldschool" : "",
                                   config_name, r->level);
       }
@@ -636,6 +641,12 @@ main (int argc, char **argv)
     p4est_nodes_destroy (nodes);
   }
 
+  if (!skip_lnodes && repartition_lnodes) {
+    p4est_partition_lnodes (p4est, ghost, 1, 0);
+    p4est_ghost_destroy (ghost);
+    ghost = p4est_ghost_new (p4est, P4EST_CONNECT_FULL);
+  }
+
   /* time the lnode numbering */
   if (!skip_lnodes) {
     sc_flops_snap (&fi, &snapshot);
@@ -649,12 +660,22 @@ main (int argc, char **argv)
   }
 
   if (test_multiple_orders) {
+    if (repartition_lnodes) {
+      p4est_partition_lnodes (p4est, ghost, 3, 0);
+      p4est_ghost_destroy (ghost);
+      ghost = p4est_ghost_new (p4est, P4EST_CONNECT_FULL);
+    }
     sc_flops_snap (&fi, &snapshot);
     lnodes = p4est_lnodes_new (p4est, ghost, 3);
     sc_flops_shot (&fi, &snapshot);
     sc_stats_set1 (&stats[TIMINGS_LNODES3], snapshot.iwtime, "L-Nodes 3");
     p4est_lnodes_destroy (lnodes);
 
+    if (repartition_lnodes) {
+      p4est_partition_lnodes (p4est, ghost, 7, 0);
+      p4est_ghost_destroy (ghost);
+      ghost = p4est_ghost_new (p4est, P4EST_CONNECT_FULL);
+    }
     sc_flops_snap (&fi, &snapshot);
     lnodes = p4est_lnodes_new (p4est, ghost, 7);
     sc_flops_shot (&fi, &snapshot);
diff --git a/example/timings/timings3.c b/example/timings/timings3.c
index 35154c9..7415043 100644
--- a/example/timings/timings3.c
+++ b/example/timings/timings3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/timings/tsearch3.c b/example/timings/tsearch3.c
index 5db5a6b..a541455 100644
--- a/example/timings/tsearch3.c
+++ b/example/timings/tsearch3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -43,6 +44,10 @@
 #include <sc_options.h>
 #include <sc_statistics.h>
 
+/** We had 1. / 0. here to create an infinity but that is not portable. */
+#define TSEARCH_INVALID (-1.)
+static const double tsearch_invalid = TSEARCH_INVALID;
+
 static int          refine_level, level_shift;
 
 typedef enum tsearch_stats
@@ -402,9 +407,9 @@ tsearch_setup (tsearch_global_t * tsg)
   int                 retval;
   double              nref[P4EST_DIM];
 #endif
-  double              bbox[3][2] = { {1. / 0., -1. / 0.},
-  {1. / 0., -1. / 0.},
-  {1. / 0., -1. / 0.}
+  double              bbox[3][2] = { {TSEARCH_INVALID, TSEARCH_INVALID},
+  {TSEARCH_INVALID, TSEARCH_INVALID},
+  {TSEARCH_INVALID, TSEARCH_INVALID}
   };
 
   mlen = 1. / P4EST_ROOT_LEN;
@@ -449,7 +454,7 @@ tsearch_setup (tsearch_global_t * tsg)
   }
   else {
     tsg->R2 = 0.;
-    tsg->r2 = 1. / 0.;
+    tsg->r2 = tsearch_invalid;
     for (k = 0; k < P4EST_CHILDREN; ++k) {
       p4est_quadrant_corner_node (q, k, &c);
       ref[0] = c.x * mlen - 1. + (tsg->which_tree & 1);
diff --git a/sc/AUTHORS b/sc/AUTHORS
index 71cec34..9aa7931 100644
--- a/sc/AUTHORS
+++ b/sc/AUTHORS
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -36,6 +37,9 @@
   Adrian Seyboldt <adrian.seyboldt at gmail.com>
   Sarah Weischer <s.weischer at gmx.de>
   Johannes Holke <holke at ins.uni-bonn.de>
+  Johann Rudi <johann at ices.utexas.edu>
+  Cristiano Malossi <acm at zurich.ibm.com>
+  Pierre Kestener <pierre.kestener at cea.fr>
 */
 
 /*
@@ -61,6 +65,12 @@
 */
 
 /*
+  The SC library is free software under the GNU LGPL v2.1 or later.
+  This is in agreement with the fact that some contributions have been
+  placed under the FreeBSD license and/or in the public domain.
+
+  For information on individual contributions see doc/author_*.txt.
+
   The files under src/sc_builtin are copied from the GNU C library.
   See copyright information below.
 
diff --git a/sc/Doxyfile.in b/sc/Doxyfile.in
index da6d7cb..e95179a 100644
--- a/sc/Doxyfile.in
+++ b/sc/Doxyfile.in
@@ -764,7 +764,7 @@ INPUT_ENCODING         = UTF-8
 # *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
 # *.qsf, *.as and *.js.
 
-FILE_PATTERNS          =
+FILE_PATTERNS          = *.h
 
 # The RECURSIVE tag can be used to specify whether or not subdirectories should
 # be searched for input files as well.
diff --git a/sc/Makefile.am b/sc/Makefile.am
index c8c4463..12b7d57 100644
--- a/sc/Makefile.am
+++ b/sc/Makefile.am
@@ -67,7 +67,9 @@ include example/function/Makefile.am
 include example/logging/Makefile.am
 include example/options/Makefile.am
 include example/pthread/Makefile.am
+include example/openmp/Makefile.am
 include example/warp/Makefile.am
+include example/testing/Makefile.am
 
 # lint static syntax checker
 ALL_LINT_FLAGS = $(LINT_FLAGS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
@@ -92,8 +94,10 @@ ChangeLog:
 dist-hook:
 	echo $(VERSION) > $(distdir)/.tarball-version
 	test "x$(VERSION)" = "x`@top_srcdir@/build-aux/git-version-gen\
-              @top_srcdir@/.tarball-version`" || \
-        ((echo "Stale version;" ; echo "Please run:" ; \
+              $(distdir)/.tarball-version`" || \
+        ((echo "Stale version;" ; echo $(VERSION); \
+          echo `@top_srcdir@/build-aux/git-version-gen\
+              $(distdir)/.tarball-version`; echo "Please run:" ; \
           echo "     cd @top_srcdir@ && ./bootstrap" ; \
           echo "before make dist") 1>&2 ; rm -r $(distdir) ; exit 1)
 
diff --git a/sc/README b/sc/README
index 4f29ab4..f0a2ee1 100644
--- a/sc/README
+++ b/sc/README
@@ -3,12 +3,14 @@ This is the README file for libsc.
 
 The SC Library provides support for parallel scientific applications.
 
-Copyright (C) 2010 The University of Texas System.
-libsc is written by Carsten Burstedde, Lucas C. Wilcox, Tobin Isaac, and others
-and released under the GNU Lesser General Public Licence version 2.1 (or, at
-your option, any later version).
+Copyright (C) 2010 The University of Texas System
+Additional copyright (C) 2011 individual authors
+
+libsc is written by Carsten Burstedde, Lucas C. Wilcox, Tobin Isaac, and
+others.  libsc is free software released under the GNU Lesser General Public
+Licence version 2.1 (or, at your option, any later version).
 
 The official web page for source code and documentation is www.p4est.org.
-Please send bug reports and ideas for contribution to p4est at librelist.com.
+Please send bug reports and ideas for contribution to info at p4est.org.
 
 The build instructions for p4est also apply to standalone builds of libsc.
diff --git a/sc/config/sc_include.m4 b/sc/config/sc_include.m4
index 4d8e347..b77a84e 100644
--- a/sc/config/sc_include.m4
+++ b/sc/config/sc_include.m4
@@ -270,6 +270,8 @@ SC_CHECK_LIB([lua52 lua5.2 lua51 lua5.1 lua lua5], [lua_createtable],
 SC_CHECK_BLAS_LAPACK([$1])
 SC_BUILTIN_ALL_PREFIX([$1])
 SC_CHECK_PTHREAD([$1])
+SC_CHECK_OPENMP([$1])
+SC_CHECK_MEMALIGN([$1])
 dnl SC_CUDA([$1])
 ])
 
diff --git a/sc/config/sc_memalign.m4 b/sc/config/sc_memalign.m4
new file mode 100644
index 0000000..be3db68
--- /dev/null
+++ b/sc/config/sc_memalign.m4
@@ -0,0 +1,96 @@
+
+dnl SC_CHECK_MEMALIGN(PREFIX)
+dnl Let the user specify --enable-memalign[=X] or --disable-memalign.
+dnl The alignment argument X must be a multiple of sizeof (void *).
+dnl
+dnl The default is --enable-memalign, which sets X to (sizeof (void *)).
+dnl
+dnl This macro also searches for the aligned allocation functions aligned_alloc
+dnl (C11 / glibc >= 2.16) and posix_memalign (POSIX / glibc >= 2.1.91) and
+dnl defines SC_HAVE_ALIGNED_ALLOC and SC_HAVE_POSIX_MEMALIGN, respectively.
+dnl If found and alignment is enabled, this macro runs the link tests.
+dnl
+dnl If memory alignment is selected, the sc_malloc calls and friends will
+dnl use the aligned version, relying on posix_memalign if it exists.
+dnl
+AC_DEFUN([SC_CHECK_MEMALIGN], [
+
+dnl check for size of types
+AC_CHECK_SIZEOF([void *])
+
+dnl check for presence of functions
+AC_CHECK_FUNCS([aligned_alloc posix_memalign])
+
+dnl custom memory alignment option
+AC_MSG_CHECKING([for memory alignment option])
+SC_ARG_DISABLE_PREFIX([memalign],
+  [use aligned malloc (optionally use --enable-memalign=<bytes>)],
+  [MEMALIGN], [$1])
+
+dnl read the value of the configuration argument
+if test "x$$1_ENABLE_MEMALIGN" != xno ; then
+  if test "x$$1_ENABLE_MEMALIGN" != xyes ; then
+
+    dnl make sure the alignment is a number 
+    $1_MEMALIGN_BYTES=`echo "$$1_ENABLE_MEMALIGN" | tr -c -d '[[:digit:]]'`
+    $1_MEMALIGN_BYTES_LINK="$$1_MEMALIGN_BYTES"
+    if test "x$$1_MEMALIGN_BYTES" = x ; then
+      AC_MSG_ERROR([Please provide --enable-memalign with a numeric value or nothing])
+    fi
+  else
+    $1_MEMALIGN_BYTES_LINK="SIZEOF_VOID_P"
+    $1_MEMALIGN_BYTES="$1_$$1_MEMALIGN_BYTES_LINK"
+  fi
+  AC_DEFINE_UNQUOTED([MEMALIGN_BYTES], [($$1_MEMALIGN_BYTES)],
+                     [desired alignment of allocations in bytes])
+  AC_MSG_RESULT([$$1_MEMALIGN_BYTES])
+
+dnl verify that aligned_alloc can be linked against
+  if test "x$ac_cv_func_aligned_alloc" = xyes ; then
+    AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <stdlib.h>]],
+[[
+int *a = (int *) aligned_alloc ($$1_MEMALIGN_BYTES_LINK, 3 * sizeof(*a));
+free(a);
+]])],
+                   [], [AC_MSG_ERROR([Linking aligned_alloc failed])])
+  fi
+
+dnl verify that posix_memalign can be linked against
+  if test "x$ac_cv_func_posix_memalign" = xyes ; then
+    AC_LINK_IFELSE([AC_LANG_PROGRAM(
+[[
+#include<stdlib.h>
+#include<errno.h>
+]],
+[[
+int *a;
+int err = posix_memalign((void **) &a, $$1_MEMALIGN_BYTES_LINK, 3 * sizeof(*a));
+free(a);
+]])],
+                   [], [AC_MSG_ERROR([Linking posix_memalign failed])])
+  fi
+
+dnl the function memalign is obsolete and not used
+dnl   $1_WITH_MEMALIGN="yes"
+dnl   AC_SEARCH_LIBS([memalign])
+dnl   if test "x$ac_cv_search_memalign" != "xnone required" ; then
+dnl     $1_WITH_MEMALIGN=no
+dnl   fi
+dnl   if test "x$$1_WITH_MEMALIGN" = xyes ; then
+dnl     AC_LINK_IFELSE([AC_LANG_PROGRAM(
+dnl [[
+dnl #include<stdlib.h>
+dnl ]],
+dnl [[
+dnl int *a = (int *) memalign($$1_MEMALIGN_BYTES,3*sizeof(*a));
+dnl free(a);
+dnl ]])],
+dnl                    [],[$1_WITH_MEMALIGN=no])
+dnl   fi
+dnl   if test "x$$1_WITH_MEMALIGN" = xyes ; then
+dnl     AC_DEFINE([WITH_MEMALIGN],[1],[define to 1 if memalign() found])
+dnl   fi
+else
+  AC_MSG_RESULT([not used])
+fi
+])
diff --git a/sc/config/sc_mpi.m4 b/sc/config/sc_mpi.m4
index 4e66e36..5c5e359 100644
--- a/sc/config/sc_mpi.m4
+++ b/sc/config/sc_mpi.m4
@@ -307,6 +307,59 @@ mpiret = MPI_Finalize ();
  $2])
 ])
 
+dnl SC_MPIWINSHARED_C_COMPILE_AND_LINK([action-if-successful], [action-if-failed])
+dnl Compile and link an MPI_Win_allocate_shared test program
+dnl
+AC_DEFUN([SC_MPIWINSHARED_C_COMPILE_AND_LINK],
+[
+AC_MSG_CHECKING([compile/link for MPI_Win_allocate_shared C program])
+AC_LINK_IFELSE([AC_LANG_PROGRAM(
+[[
+#undef MPI
+#include <mpi.h>
+]], [[
+int mpiret;
+int mpithr;
+int disp_unit=0;
+char *baseptr;
+MPI_Win win;
+MPI_Init ((int *) 0, (char ***) 0);
+mpiret = MPI_Win_allocate_shared(0,disp_unit,MPI_INFO_NULL,MPI_COMM_WORLD,(void *) &baseptr,&win);
+mpiret = MPI_Win_shared_query(win,0,0,&disp_unit,(void *) &baseptr);
+mpiret = MPI_Win_lock(MPI_LOCK_EXCLUSIVE,0,MPI_MODE_NOCHECK,win);
+mpiret = MPI_Win_unlock(0,win);
+mpiret = MPI_Win_free(&win);
+mpiret = MPI_Finalize ();
+]])],
+[AC_MSG_RESULT([successful])
+ $1],
+[AC_MSG_RESULT([failed])
+ $2])
+])
+
+dnl SC_MPICOMMSHARED_C_COMPILE_AND_LINK([action-if-successful], [action-if-failed])
+dnl Compile and link an MPI_COMM_TYPE_SHARED test program
+dnl
+AC_DEFUN([SC_MPICOMMSHARED_C_COMPILE_AND_LINK],
+[
+AC_MSG_CHECKING([compile/link for MPI_COMM_TYPE_SHARED C program])
+AC_LINK_IFELSE([AC_LANG_PROGRAM(
+[[
+#undef MPI
+#include <mpi.h>
+]], [[
+int mpiret;
+MPI_Comm subcomm;
+MPI_Init ((int *) 0, (char ***) 0);
+mpiret = MPI_Comm_split_type(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&subcomm);
+mpiret = MPI_Finalize ();
+]])],
+[AC_MSG_RESULT([successful])
+ $1],
+[AC_MSG_RESULT([failed])
+ $2])
+])
+
 dnl SC_MPI_INCLUDES
 dnl Call the compiler with various --show* options
 dnl to figure out the MPI_INCLUDES and MPI_INCLUDE_PATH varables
@@ -393,6 +446,16 @@ dnl  ])
     SC_MPITHREAD_C_COMPILE_AND_LINK(,
       [AC_MSG_ERROR([MPI_Init_thread not found; you may try --disable-mpithread])])
   fi
+  $1_ENABLE_MPIWINSHARED=yes
+  SC_MPIWINSHARED_C_COMPILE_AND_LINK(,[$1_ENABLE_MPIWINSHARED=no])
+  if test "x$$1_ENABLE_MPIWINSHARED" = xyes ; then
+    AC_DEFINE([ENABLE_MPIWINSHARED], 1, [Define to 1 if we can use MPI_Win_allocate_shared])
+  fi
+  $1_ENABLE_MPICOMMSHARED=yes
+  SC_MPICOMMSHARED_C_COMPILE_AND_LINK(,[$1_ENABLE_MPICOMMSHARED=no])
+  if test "x$$1_ENABLE_MPICOMMSHARED" = xyes ; then
+    AC_DEFINE([ENABLE_MPICOMMSHARED], 1, [Define to 1 if we can use MPI_COMM_TYPE_SHARED])
+  fi
 fi
 
 dnl figure out the MPI include directories
diff --git a/sc/config/sc_openmp.m4 b/sc/config/sc_openmp.m4
new file mode 100644
index 0000000..3a76d3a
--- /dev/null
+++ b/sc/config/sc_openmp.m4
@@ -0,0 +1,49 @@
+
+dnl SC_CHECK_OPENMP(PREFIX)
+dnl Check for OpenMP support and link a test program
+dnl
+dnl This macro tries to link to omp_get_thread_num both as is and with -lgomp.
+dnl If neither of these works, we throw an error.
+dnl Use the LIBS variable on the configure line to specify a different library.
+dnl
+dnl Using --enable-openmp without any argument defaults to -fopenmp.
+dnl For different CFLAGS use --enable-openmp="-my-openmp-cflags" or similar.
+dnl
+AC_DEFUN([SC_CHECK_OPENMP], [
+
+dnl This link test changes the LIBS variable in place for posterity
+dnl SAVE_LIBS="$LIBS"
+SC_CHECK_LIB([gomp], [omp_get_thread_num], [OPENMP], [$1])
+dnl LIBS="$SAVE_LIBS"
+AC_MSG_CHECKING([for OpenMP])
+
+SC_ARG_ENABLE_PREFIX([openmp],
+  [enable OpenMP (optionally use --enable-openmp=<OPENMP_CFLAGS>)],
+  [OPENMP], [$1])
+if test "x$$1_ENABLE_OPENMP" != xno ; then
+  $1_OPENMP_CFLAGS="-fopenmp"
+  if test "x$$1_ENABLE_OPENMP" != xyes ; then
+    $1_OPENMP_CFLAGS="$$1_ENABLE_OPENMP"
+    dnl AC_MSG_ERROR([Please provide --enable-openmp without arguments])
+  fi
+  PRE_OPENMP_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS $$1_OPENMP_CFLAGS"
+  AC_LINK_IFELSE([AC_LANG_PROGRAM(
+[[
+#include <omp.h>
+]],[[
+  omp_set_num_threads (2);
+  #pragma omp parallel
+  {
+    int id = omp_get_thread_num ();
+  }
+]])],,
+                 [AC_MSG_ERROR([Unable to link with OpenMP])])
+dnl Keep the variables changed as done above
+dnl CFLAGS="$PRE_OPENMP_CFLAGS"
+
+  AC_MSG_RESULT([successful])
+else
+  AC_MSG_RESULT([not used])
+fi
+])
diff --git a/sc/config/sc_pthread.m4 b/sc/config/sc_pthread.m4
index 165a08a..ab2621e 100644
--- a/sc/config/sc_pthread.m4
+++ b/sc/config/sc_pthread.m4
@@ -2,8 +2,13 @@
 dnl SC_CHECK_PTHREAD(PREFIX)
 dnl Check for POSIX thread support and link a test program
 dnl
+dnl This macro tries to link to pthread_create both as is and with -lpthread.
+dnl If neither of these works, we throw an error.
+dnl Use the LIBS variable on the configure line to specify a different library.
+dnl
 AC_DEFUN([SC_CHECK_PTHREAD], [
 
+dnl This link test changes the LIBS variable in place for posterity
 SC_CHECK_LIB([pthread], [pthread_create], [LPTHREAD], [$1])
 AC_MSG_CHECKING([for POSIX threads])
 
diff --git a/sc/configure.ac b/sc/configure.ac
index 11c4f52..50199c5 100644
--- a/sc/configure.ac
+++ b/sc/configure.ac
@@ -4,7 +4,7 @@ dnl
 
 AC_INIT([libsc],
         [m4_esyscmd([build-aux/git-version-gen .tarball-version])],
-        [p4est at librelist.com])
+        [info at p4est.org])
 AC_PREREQ(2.61)
 AC_CONFIG_HEADERS([src/pre_config.h])
 AC_CONFIG_SRCDIR([src/sc.h])
@@ -35,6 +35,8 @@ See sc.h for possible log priorities in --enable-logging=PRIO])
                esac])
 SC_ARG_ENABLE([debug], [enable debug mode (assertions and extra checks)],
               [DEBUG])
+SC_ARG_DISABLE([realloc], [replace array/dmatrix resize with malloc/copy/free],
+               [USE_REALLOC])
 SC_ARG_WITH([papi], [enable Flop counting with papi], [PAPI])
 
 echo "o---------------------------------------"
@@ -51,17 +53,28 @@ dnl SC_C_VERSION
 LT_INIT
 
 echo "o---------------------------------------"
-echo "| Checking libraries"
+echo "| Checking keywords and types"
 echo "o---------------------------------------"
 
-SC_CHECK_LIBRARIES([SC])
+AC_C_BIGENDIAN([AC_DEFINE([IS_BIGENDIAN], 1, [Define to 1 on a bigendian machine])])
+AC_C_CONST
+AC_C_INLINE
+AC_C_RESTRICT
+AC_CHECK_SIZEOF([int])
+AC_CHECK_SIZEOF([long])
+AC_CHECK_SIZEOF([long long])
+AC_CHECK_SIZEOF([unsigned long])
+AC_CHECK_SIZEOF([unsigned long long])
+AC_CHECK_SIZEOF([void *])
+AC_TYPE_SIZE_T
+AC_TYPE_SSIZE_T
 
 echo "o---------------------------------------"
 echo "| Checking headers"
 echo "o---------------------------------------"
 
 AC_CHECK_HEADERS([execinfo.h signal.h sys/time.h sys/types.h time.h])
-AC_CHECK_HEADERS([lua.h lua5.1/lua.h lua5.2/lua.h])
+AC_CHECK_HEADERS([lua.h lua5.1/lua.h lua5.2/lua.h lua5.3/lua.h])
 
 echo "o---------------------------------------"
 echo "| Checking functions"
@@ -70,19 +83,10 @@ echo "o---------------------------------------"
 AC_CHECK_FUNCS([backtrace backtrace_symbols strtol strtoll])
 
 echo "o---------------------------------------"
-echo "| Checking keywords and types"
+echo "| Checking libraries"
 echo "o---------------------------------------"
 
-AC_C_BIGENDIAN([AC_DEFINE([IS_BIGENDIAN], 1, [Define to 1 on a bigendian machine])])
-AC_C_CONST
-AC_C_INLINE
-AC_C_RESTRICT
-AC_CHECK_SIZEOF([long])
-AC_CHECK_SIZEOF([long long])
-AC_CHECK_SIZEOF([unsigned long])
-AC_CHECK_SIZEOF([unsigned long long])
-AC_TYPE_SIZE_T
-AC_TYPE_SSIZE_T
+SC_CHECK_LIBRARIES([SC])
 
 # Print summary.
 
diff --git a/sc/doc/FREEBSD b/sc/doc/FREEBSD
new file mode 100644
index 0000000..9c76845
--- /dev/null
+++ b/sc/doc/FREEBSD
@@ -0,0 +1,23 @@
+Copyright (c) <YEAR>, <OWNER>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/sc/doc/author_kestener.txt b/sc/doc/author_kestener.txt
new file mode 100644
index 0000000..7019afd
--- /dev/null
+++ b/sc/doc/author_kestener.txt
@@ -0,0 +1 @@
+I hereby place my contributions to libsc into the public domain under FreeBSD license.  Pierre Kestener.
diff --git a/sc/doc/sc_license_approval.pdf b/sc/doc/sc_license_approval.pdf
new file mode 100644
index 0000000..0d4bae5
Binary files /dev/null and b/sc/doc/sc_license_approval.pdf differ
diff --git a/sc/doc/v1.2.branches b/sc/doc/v1.2.branches
new file mode 100644
index 0000000..2d0fd7a
--- /dev/null
+++ b/sc/doc/v1.2.branches
@@ -0,0 +1,32 @@
+  attribute-unused                          # unused, skip
+  bgq                                       # this is a subset of tisaac/mpi
+  develop                                   # only alloc logging is worthwhile
+  feature/shmem-array                       # subset of tisaac/mpi
+  ti/pu                                     # want sc_memory_status,
+                                            # not .gitignore file
+  tisaac/johann/gmg                         # subset of develop
+  tisaac/mpi
+  use-realloc                               # on top of develop, don't use
+  use-realloc-old                           # use this one: just what we want
+  remotes/carsten/configfile                # ask carsten
+  remotes/carsten/dset                      # this idea is undeveloped, skip
+  remotes/carsten/mpi                       # subset of tisaac/mpi
+  remotes/carsten/ng                        # ask carsten
+  remotes/carsten/poly                      # ask carsten
+  remotes/github/feature/shmem-array        # subset of tisaac/mpi
+  remotes/github/fix-make-dist-hook         # subset of tisaac/mpi
+  remotes/github/sc-allgather               # subset of tisaac/mpi
+  remotes/github/tisaac/feature-stats-has   # yes
+  remotes/github/tisaac/mpi                 # same as local
+  remotes/johann/next                       # doesn't seem ready yet
+  remotes/origin/bgq                        # same as local
+  remotes/origin/carsten-develop            # merge branch, don't use
+  remotes/origin/develop                    # same as local
+  remotes/origin/dset                       # same as local
+  remotes/origin/feature/shmem-array        # same as local
+  remotes/origin/fix-permute                # cherry-picked in
+  remotes/origin/redistribute               # seems unfinished, no
+  remotes/origin/ti/pu                      # same as local
+  remotes/origin/tisaac/johann/next         # just sc_statistics_has, no
+  remotes/origin/use-realloc                # same as local
+  remotes/origin/use-realloc-old            # same as local
diff --git a/sc/example/bspline/bspline.c b/sc/example/bspline/bspline.c
index f035244..a53c234 100644
--- a/sc/example/bspline/bspline.c
+++ b/sc/example/bspline/bspline.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/example/bspline/bspline_per.c b/sc/example/bspline/bspline_per.c
index aa966bd..969b563 100644
--- a/sc/example/bspline/bspline_per.c
+++ b/sc/example/bspline/bspline_per.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/example/dmatrix/dmatrix.c b/sc/example/dmatrix/dmatrix.c
index 784f93d..0c8132e 100644
--- a/sc/example/dmatrix/dmatrix.c
+++ b/sc/example/dmatrix/dmatrix.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/example/function/function.c b/sc/example/function/function.c
index 4b68fde..7c20efa 100644
--- a/sc/example/function/function.c
+++ b/sc/example/function/function.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/example/logging/logging.c b/sc/example/logging/logging.c
index de14e10..aea7de9 100644
--- a/sc/example/logging/logging.c
+++ b/sc/example/logging/logging.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/example/openmp/Makefile.am b/sc/example/openmp/Makefile.am
new file mode 100644
index 0000000..7357739
--- /dev/null
+++ b/sc/example/openmp/Makefile.am
@@ -0,0 +1,13 @@
+
+# This file is part of the SC Library
+# Makefile.am in example/openmp
+# included non-recursively from toplevel directory
+
+if SC_ENABLE_OPENMP
+
+bin_PROGRAMS += example/openmp/sc_openmp
+example_openmp_sc_openmp_SOURCES = example/openmp/openmp.c
+
+LINT_CSOURCES += $(example_openmp_sc_openmp_SOURCES)
+
+endif
diff --git a/sc/example/openmp/config_with_omp.txt b/sc/example/openmp/config_with_omp.txt
new file mode 100644
index 0000000..8cec945
--- /dev/null
+++ b/sc/example/openmp/config_with_omp.txt
@@ -0,0 +1,11 @@
+/homec/hbnxx/hbnxxx/source/libsc/configure --host=powerpc64-bgq-linux \
+CC=mpicc CXX=mpig++ F77=mpigfortran FC=mpigfortran F90=mpigfortran \
+CPPFLAGS="\
+-I/bgsys/local/szip/include -I/bgsys/local/zlib/include \
+-I/bgsys/drivers/ppcfloor/comm/include \
+-I/bgsys/drivers/ppcfloor/arch/include \
+-I/bgsys/drivers/ppcfloor/gnu-linux/powerpc-bgp-linux/sys-include" \
+CFLAGS="-O0 -g -shared-libgcc" \
+--enable-mpi --disable-shared --without-blas --enable-openmp="-qsmp=omp" \
+LDFLAGS="-L/bgsys/local/szip/lib -L/bgsys/local/zlib/lib" \
+LIBS="-ldl -lsz"
diff --git a/sc/example/openmp/juqueen_job.js b/sc/example/openmp/juqueen_job.js
new file mode 100644
index 0000000..bdd020f
--- /dev/null
+++ b/sc/example/openmp/juqueen_job.js
@@ -0,0 +1,26 @@
+# This jobscript can be submitted to the juqueen 
+# job queue via llsubmit [script]
+# Do not forget to put your email address in the 
+# notify_user field.
+#
+# This script has to be executed within the folder of
+# the executable ./sc_openmp
+#
+# Notice that bg_size=32 is smallest possible number of nodes.
+# Any smaller input is automatically set to 32.
+
+# @ job_name = sc_openmp
+# @ comment = "Example libsc with openmp"
+# @ error = $(job_name).$(jobid).out
+# @ output = $(job_name).$(jobid).out
+# @ environment = COPY_ALL
+# @ wall_clock_limit = 00:30:00
+# @ notification = error
+# @ notify_user = yourname at yourserver.com
+# @ job_type = bluegene
+# @ bg_size = 32
+# @ queue
+
+export OMP_NUM_THREADS=4
+
+runjob --ranks-per-node 16 --exp-env OMP_NUM_THREADS : ./sc_openmp
diff --git a/sc/example/openmp/openmp.c b/sc/example/openmp/openmp.c
new file mode 100644
index 0000000..4d5d006
--- /dev/null
+++ b/sc/example/openmp/openmp.c
@@ -0,0 +1,58 @@
+#include <sc.h>
+#include <omp.h>
+
+/* lock taken around the log call in openmp_print_tid */
+omp_lock_t          writelock;
+
+/** Log the OpenMP thread number of the calling thread under writelock. */
+void
+openmp_print_tid (void)
+{
+  omp_set_lock (&writelock);
+  SC_PRODUCTIONF ("Hello from thread %i.\n", omp_get_thread_num ());
+  omp_unset_lock (&writelock);
+}
+
+/**
+ * Initialize MPI with thread support and libsc, then let every OpenMP
+ * thread of a parallel region log its thread number.
+ * If MPI provides less than sc_MPI_THREAD_MULTIPLE only a note is logged.
+ */
+int
+main (int argc, char *argv[])
+{
+  int                 mpiret, mpisize;
+  int                 thread_lvl, num_threads;
+
+  mpiret =
+    sc_MPI_Init_thread (&argc, &argv, sc_MPI_THREAD_MULTIPLE, &thread_lvl);
+  SC_CHECK_MPI (mpiret);
+  sc_init (sc_MPI_COMM_WORLD, 1, 1, NULL, SC_LP_DEFAULT);
+
+  if (thread_lvl < sc_MPI_THREAD_MULTIPLE) {
+    SC_GLOBAL_PRODUCTIONF ("Mpi only supports thread level %d\n", thread_lvl);
+  }
+  else {
+    mpiret = sc_MPI_Comm_size (sc_MPI_COMM_WORLD, &mpisize);
+    SC_CHECK_MPI (mpiret);
+    num_threads = omp_get_max_threads ();
+    SC_GLOBAL_PRODUCTIONF ("Running on %i processes with %i threads each.\n",
+                           mpisize, num_threads);
+    omp_set_num_threads (num_threads);
+    omp_init_lock (&writelock);
+#pragma omp parallel
+    {
+      openmp_print_tid ();
+    }
+    /* the parallel region has ended, so the lock is no longer in use */
+    omp_destroy_lock (&writelock);
+  }
+
+  /* finalize libsc and MPI, mirroring the init calls at the top of main */
+  sc_finalize ();
+
+  mpiret = sc_MPI_Finalize ();
+  SC_CHECK_MPI (mpiret);
+
+  return 0;
+}
diff --git a/sc/example/options/Makefile.am b/sc/example/options/Makefile.am
index b2aeefd..48c8ea8 100644
--- a/sc/example/options/Makefile.am
+++ b/sc/example/options/Makefile.am
@@ -3,9 +3,12 @@
 # Makefile.am in example/options
 # included non-recursively from toplevel directory
 
-bin_PROGRAMS += example/options/sc_options
+bin_PROGRAMS += example/options/sc_options example/options/sc_logging
 example_options_sc_options_SOURCES = example/options/options.c
+example_options_sc_logging_SOURCES = example/options/logging.c
 
-dist_scini_DATA += example/options/options.ini example/options/preload.ini
+dist_scini_DATA += \
+        example/options/sc_options_example.ini \
+        example/options/sc_options_preload.ini
 
 LINT_CSOURCES += $(example_options_sc_options_SOURCES)
diff --git a/sc/example/options/logging.c b/sc/example/options/logging.c
new file mode 100644
index 0000000..8a18594
--- /dev/null
+++ b/sc/example/options/logging.c
@@ -0,0 +1,97 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+#include <sc_options.h>
+
+/** Log one message through each of the debug, info, production, essential macros. */
+static void
+run_program (void)
+{
+  SC_LDEBUG ("Debug\n");
+  SC_INFO ("Info\n");
+  SC_GLOBAL_INFO ("Info\n");
+  SC_GLOBAL_PRODUCTION ("Production\n");
+  SC_GLOBAL_ESSENTIAL ("Essential\n");
+}
+
+/**
+ * Parse the verbosity option, set the sc package verbosity and log messages.
+ * \return 0 if the options were parsed, 1 if option parsing failed.
+ */
+int
+main (int argc, char **argv)
+{
+  int                 mpiret;
+  int                 retval;
+  int                 first_arg;
+  int                 verbosity;
+  sc_keyvalue_t      *priorities;
+  sc_options_t       *opt;
+
+  mpiret = sc_MPI_Init (&argc, &argv);
+  SC_CHECK_MPI (mpiret);
+
+  sc_init (sc_MPI_COMM_WORLD, 1, 1, NULL, SC_LP_DEFAULT);
+
+  /* initialize key-value structure for option parsing */
+  priorities = sc_keyvalue_new ();
+  sc_keyvalue_set_int (priorities, "default", SC_LP_DEFAULT);
+  sc_keyvalue_set_int (priorities, "debug", SC_LP_DEBUG);
+  sc_keyvalue_set_int (priorities, "informative", SC_LP_INFO);
+  sc_keyvalue_set_int (priorities, "production", SC_LP_PRODUCTION);
+  sc_keyvalue_set_int (priorities, "essential", SC_LP_ESSENTIAL);
+  sc_keyvalue_set_int (priorities, "silent", SC_LP_SILENT);
+
+  /* initialize option structure */
+  opt = sc_options_new (argv[0]);
+  sc_options_add_keyvalue (opt, 'V', "verbosity", &verbosity, "default",
+                           priorities, "Choose the log level");
+
+  /* parse command line options */
+  first_arg = sc_options_parse (sc_package_id, SC_LP_ERROR, opt, argc, argv);
+  if (first_arg < 0) {
+    SC_GLOBAL_LERROR ("Option parsing failed\n");
+    retval = 1;
+  }
+  else {
+    SC_GLOBAL_INFO ("Option parsing successful\n");
+    sc_options_print_summary (sc_package_id, SC_LP_PRODUCTION, opt);
+
+    /* set verbosity level */
+    sc_package_set_verbosity (sc_package_id, verbosity);
+
+    /* go to work */
+    run_program ();
+    retval = 0;
+  }
+
+  /* cleanup */
+  sc_options_destroy_deep (opt);
+
+  sc_finalize ();
+
+  mpiret = sc_MPI_Finalize ();
+  SC_CHECK_MPI (mpiret);
+
+  return retval;
+}
diff --git a/sc/example/options/options.c b/sc/example/options/options.c
index fde4d7c..78aa6d0 100644
--- a/sc/example/options/options.c
+++ b/sc/example/options/options.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -22,6 +23,9 @@
 
 #include <sc_options.h>
 
+/* quick and dirty -- please use callback's data argument instead */
+static int          w;
+
 static int
 callback (sc_options_t * opt, const char *theoptarg, void *data)
 {
@@ -31,6 +35,7 @@ callback (sc_options_t * opt, const char *theoptarg, void *data)
   else {
     SC_GLOBAL_INFOF ("%s with %s\n", (const char *) data, theoptarg);
   }
+  ++w;
 
   return 0;
 }
@@ -41,12 +46,13 @@ main (int argc, char **argv)
   int                 mpiret, retval;
   int                 rank;
   int                 first_arg;
-  int                 w;
   int                 i1, i2, si1;
+  int                 kvint;
   size_t              z;
   double              d, sd;
   const char         *s1, *s2, *ss1, *ss2;
   const char         *cd = "Callback example";
+  sc_keyvalue_t      *keyvalue;
   sc_options_t       *opt, *subopt;
 
   mpiret = sc_MPI_Init (&argc, &argv);
@@ -57,6 +63,10 @@ main (int argc, char **argv)
 
   sc_init (sc_MPI_COMM_WORLD, 1, 1, NULL, SC_LP_DEFAULT);
 
+  keyvalue = sc_keyvalue_new ();
+  sc_keyvalue_set_int (keyvalue, "one", 1);
+  sc_keyvalue_set_int (keyvalue, "two", 2);
+
   opt = sc_options_new (argv[0]);
   sc_options_add_switch (opt, 'w', "switch", &w, "Switch");
   sc_options_add_int (opt, 'i', "integer1", &i1, 0, "Integer 1");
@@ -69,7 +79,8 @@ main (int argc, char **argv)
   sc_options_add_string (opt, 't', NULL, &s2, NULL, "String 2");
   sc_options_add_inifile (opt, 'f', "inifile", ".ini file");
   sc_options_add_int (opt, '\0', "integer2", &i2, 7, "Integer 2");
-  sc_options_add_size_t (opt, 'z', "sizet", &z, (size_t) 7000000000ULL, "Size_t");
+  sc_options_add_size_t (opt, 'z', "sizet", &z, (size_t) 7000000000ULL,
+                         "Size_t");
 
   subopt = sc_options_new (argv[0]);
   sc_options_add_int (subopt, 'i', "integer", &si1, 0, "Subset integer");
@@ -77,11 +88,14 @@ main (int argc, char **argv)
   sc_options_add_string (subopt, 's', NULL, &ss1, NULL, "Subset string 1");
   sc_options_add_string (subopt, '\0', "string2", &ss2, NULL,
                          "Subset string 1");
+  sc_options_add_keyvalue (subopt, 'n', "number", &kvint, "one",
+                           keyvalue, "Subset keyvalue number");
 
   sc_options_add_suboptions (opt, subopt, "Subset");
 
   /* this is just to show off the load function */
-  if (!sc_options_load (sc_package_id, SC_LP_INFO, opt, "preload.ini")) {
+  if (!sc_options_load (sc_package_id, SC_LP_INFO, opt,
+                        "sc_options_preload.ini")) {
     SC_GLOBAL_INFO ("Preload successful\n");
   }
   else {
@@ -91,12 +105,13 @@ main (int argc, char **argv)
   first_arg = sc_options_parse (sc_package_id, SC_LP_INFO, opt, argc, argv);
   if (first_arg < 0) {
     sc_options_print_usage (sc_package_id, SC_LP_INFO, opt,
-                            "This is arg 1\nand this is arg 2");
+                            "Usage for arg 1\nand for arg 2");
     SC_GLOBAL_INFO ("Option parsing failed\n");
   }
   else {
     SC_GLOBAL_INFO ("Option parsing successful\n");
     sc_options_print_summary (sc_package_id, SC_LP_INFO, opt);
+    SC_GLOBAL_INFOF ("Keyvalue number is now %d\n", kvint);
 
     if (rank == 0) {
       retval = sc_options_save (sc_package_id, SC_LP_INFO, opt, "output.ini");
@@ -119,6 +134,7 @@ main (int argc, char **argv)
 
   sc_options_destroy (opt);
   sc_options_destroy (subopt);
+  sc_keyvalue_destroy (keyvalue);
 
   sc_finalize ();
 
diff --git a/sc/example/options/options.ini b/sc/example/options/sc_options_example.ini
similarity index 100%
rename from sc/example/options/options.ini
rename to sc/example/options/sc_options_example.ini
diff --git a/sc/example/options/preload.ini b/sc/example/options/sc_options_preload.ini
similarity index 100%
rename from sc/example/options/preload.ini
rename to sc/example/options/sc_options_preload.ini
diff --git a/sc/example/pthread/condvar.c b/sc/example/pthread/condvar.c
index 29a84d8..6bf3fbb 100644
--- a/sc/example/pthread/condvar.c
+++ b/sc/example/pthread/condvar.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/example/pthread/pthread.c b/sc/example/pthread/pthread.c
index a8f6b18..d42206e 100644
--- a/sc/example/pthread/pthread.c
+++ b/sc/example/pthread/pthread.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -27,7 +28,7 @@ typedef struct thread_data
 {
   pthread_t           thread;
   int                 id;
-  MPI_Comm            mpicomm;
+  sc_MPI_Comm         mpicomm;
 }
 thread_data_t;
 
diff --git a/sc/example/testing/Makefile.am b/sc/example/testing/Makefile.am
new file mode 100644
index 0000000..ae5a384
--- /dev/null
+++ b/sc/example/testing/Makefile.am
@@ -0,0 +1,9 @@
+
+# This file is part of the SC Library
+# Makefile.am in example/testing
+# included non-recursively from toplevel directory
+
+bin_PROGRAMS += example/testing/sc_test_shmem
+example_testing_sc_test_shmem_SOURCES = example/testing/sc_test_shmem.c
+
+LINT_CSOURCES += $(example_testing_sc_test_shmem_SOURCES)
diff --git a/sc/example/testing/sc_test_shmem.c b/sc/example/testing/sc_test_shmem.c
new file mode 100644
index 0000000..9025607
--- /dev/null
+++ b/sc/example/testing/sc_test_shmem.c
@@ -0,0 +1,324 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+#include <sc_shmem.h>
+
+#define DATA_SIZE 10
+
+/* This struct stores data which we use to test shared
+ * memory arrays.
+ */
+typedef struct
+{
+  int                 rank;     /*< This entry stores the rank of the creating process */
+  double              data[DATA_SIZE];  /*< This field can store arbitrary data */
+} data_t;
+
+#if 0
+/* Debugging helper (disabled): one process after the other prints the
+ * address of array and the rank entry of its first mpisize elements.
+ */
+void
+test_shmem_print_int (data_t * array, sc_MPI_Comm comm)
+{
+  int                 i, p;
+  MPI_Aint            address;
+  int                 mpirank, mpisize, mpiret;
+  char                outstring[BUFSIZ];
+
+  mpiret = sc_MPI_Comm_size (comm, &mpisize);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_rank (comm, &mpirank);
+  SC_CHECK_MPI (mpiret);
+
+  mpiret = MPI_Get_address ((void *) array, &address);
+  SC_CHECK_MPI (mpiret);
+  outstring[0] = '\0';
+  snprintf (outstring + strlen (outstring), BUFSIZ - strlen (outstring),
+            "Array at %li:\t", (long) address);
+  for (i = 0; i < mpisize; i++)
+    /* loop over array entries, appending each rank entry to the string */
+  {
+    snprintf (outstring + strlen (outstring), BUFSIZ - strlen (outstring),
+              "%i ", array[i].rank);
+  }
+
+  for (p = 0; p < mpisize; p++)
+    /* loop over procs; only the rank whose turn it is prints, then barrier */
+  {
+    if (mpirank == p) {
+      printf ("[H %i] %s\n", mpirank, outstring);
+      outstring[0] = '\0';
+      fflush (stdout);
+    }
+    sc_MPI_Barrier (comm);
+  }
+}
+#endif
+
+/* Return 1 if a given data item has the entries
+ * data.rank = i and
+ * data.data = {0,...,DATA_SIZE-1}; return 0 otherwise.
+ */
+int
+test_shmem_correct_data (data_t * data, int i)
+{
+  int                 j;
+  if (data->rank != i) {
+    return 0;
+  }
+  for (j = 0; j < DATA_SIZE; j++) {
+    if (data->data[j] != (double) j) {  /* exact compare of small integers */
+      return 0;
+    }
+  }
+  return 1;
+}
+
+/* Fill the data array of one data item with the numbers
+ * 0,...,DATA_SIZE-1.  The rank entry of the item is not modified.
+ */
+void
+test_shmem_fill_data (data_t * data)
+{
+  int                 i;
+
+  for (i = 0; i < DATA_SIZE; i++) {
+    data->data[i] = (double) i;
+  }
+}
+
+/* allocate a shared memory array and fill the data fields by allgather */
+data_t *
+test_shmem_create_data_array (sc_shmem_type_t type, int mpirank, int mpisize)
+{
+  data_t              data;
+  data_t             *data_array;
+  int                 i;
+
+  data.rank = mpirank;
+  test_shmem_fill_data (&data);
+
+  sc_shmem_set_type (sc_MPI_COMM_WORLD, type);
+
+  data_array = SC_SHMEM_ALLOC (data_t, mpisize, sc_MPI_COMM_WORLD);
+  SC_CHECK_ABORT (data_array != NULL, "Allocation failed");
+
+  sc_shmem_allgather (&data, sizeof (data_t), sc_MPI_BYTE, data_array,
+                      sizeof (data_t), sc_MPI_BYTE, sc_MPI_COMM_WORLD);
+  /* check whether creation worked: entry i must carry rank i */
+  for (i = 0; i < mpisize; i++) {
+    SC_CHECK_ABORTF (test_shmem_correct_data (&data_array[i], i),
+                     "Error in shmem_allgather. Array entry %i is not correct.",
+                     i);
+  }
+  return data_array;            /* the callers release it via SC_SHMEM_FREE */
+}
+
+/* For a given shmem type, allocate a shared array
+ * and fill it with data via a call to shmem_allgather.
+ * We check whether all data was gathered correctly and
+ * free the array.
+ */
+void
+test_shmem_allgather (sc_shmem_type_t type)
+{
+  data_t             *data_array;
+  int                 mpirank, mpisize, mpiret;
+  int                 i;
+
+  SC_GLOBAL_ESSENTIALF ("Testing allgather with type %s.\n",
+                        sc_shmem_type_to_string[type]);
+  mpiret = sc_MPI_Comm_size (sc_MPI_COMM_WORLD, &mpisize);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_rank (sc_MPI_COMM_WORLD, &mpirank);
+  SC_CHECK_MPI (mpiret);
+
+  data_array = test_shmem_create_data_array (type, mpirank, mpisize);
+
+  for (i = 0; i < mpisize; i++) {
+    SC_CHECK_ABORTF (test_shmem_correct_data (&data_array[i], i),
+                     "Error in shmem_allgather. Array entry %i is not correct.",
+                     i);
+  }
+  SC_SHMEM_FREE (data_array, sc_MPI_COMM_WORLD);
+
+  SC_GLOBAL_ESSENTIALF ("Testing type %s succesful.\n",
+                        sc_shmem_type_to_string[type]);
+}
+
+/* For a given shmem type, create a shmem array, copy it into a second one
+ * with sc_shmem_memcpy, check that both match bytewise and free both.
+ */
+void
+test_shmem_copy (sc_shmem_type_t type)
+{
+  data_t             *data_array, *copy_array;
+  int                 mpirank, mpisize, mpiret;
+  int                 i;
+
+  SC_GLOBAL_ESSENTIALF ("Testing copy with type %s.\n",
+                        sc_shmem_type_to_string[type]);
+  mpiret = sc_MPI_Comm_size (sc_MPI_COMM_WORLD, &mpisize);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_rank (sc_MPI_COMM_WORLD, &mpirank);
+  SC_CHECK_MPI (mpiret);
+
+  data_array = test_shmem_create_data_array (type, mpirank, mpisize);
+  copy_array = SC_SHMEM_ALLOC (data_t, mpisize, sc_MPI_COMM_WORLD);
+
+  sc_shmem_memcpy ((void *) copy_array, (void *) data_array,
+                   mpisize * sizeof (data_t), sc_MPI_COMM_WORLD);
+  /* Check whether the copy worked, one array entry at a time */
+  for (i = 0; i < mpisize; i++) {
+    SC_CHECK_ABORTF (!memcmp
+                     (&data_array[i], &copy_array[i], sizeof (data_t)),
+                     "Error in shmem_copy. Array entries at %i do not match",
+                     i);
+  }
+
+  SC_SHMEM_FREE (data_array, sc_MPI_COMM_WORLD);
+  SC_SHMEM_FREE (copy_array, sc_MPI_COMM_WORLD);
+
+  SC_GLOBAL_ESSENTIALF ("Testing type %s succesful.\n",
+                        sc_shmem_type_to_string[type]);
+}
+
+/* Whoever gets write access fills a shared array; all ranks check the result. */
+void
+test_shmem_write (sc_shmem_type_t type)
+{
+  data_t             *data_array;
+  int                 mpisize, mpiret;
+  int                 i;
+
+  SC_GLOBAL_ESSENTIALF ("Testing shmem_write with type %s.\n",
+                        sc_shmem_type_to_string[type]);
+  mpiret = sc_MPI_Comm_size (sc_MPI_COMM_WORLD, &mpisize);
+  SC_CHECK_MPI (mpiret);
+
+  sc_shmem_set_type (sc_MPI_COMM_WORLD, type);  /* else last type is reused */
+  data_array = SC_SHMEM_ALLOC (data_t, mpisize, sc_MPI_COMM_WORLD);
+  /* The process that first gets write access to the array writes the data */
+  if (sc_shmem_write_start (data_array, sc_MPI_COMM_WORLD)) {
+    for (i = 0; i < mpisize; i++) {
+      data_array[i].rank = i;
+      test_shmem_fill_data (&data_array[i]);
+    }
+  }
+
+  sc_shmem_write_end (data_array, sc_MPI_COMM_WORLD);
+  mpiret = sc_MPI_Barrier (sc_MPI_COMM_WORLD);
+  SC_CHECK_MPI (mpiret);
+
+  /* All processes check whether writing worked */
+  for (i = 0; i < mpisize; i++) {
+    SC_CHECK_ABORTF (test_shmem_correct_data (&data_array[i], i),
+                     "Error in shmem_write. Array entry %i is not correct.",
+                     i);
+  }
+
+  SC_SHMEM_FREE (data_array, sc_MPI_COMM_WORLD);
+  SC_GLOBAL_ESSENTIALF ("Testing type %s succesful.\n",
+                        sc_shmem_type_to_string[type]);
+}
+
+/* Prefix-sum the ranks with the given type: entry i must be i * (i - 1) / 2. */
+void
+test_shmem_prefix (sc_shmem_type_t type)
+{
+  int                *data_array;
+  int                 i, mpirank, mpisize, mpiret;
+
+  SC_GLOBAL_ESSENTIALF ("Testing prefix with type %s.\n",
+                        sc_shmem_type_to_string[type]);
+  mpiret = sc_MPI_Comm_size (sc_MPI_COMM_WORLD, &mpisize);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_rank (sc_MPI_COMM_WORLD, &mpirank);
+  SC_CHECK_MPI (mpiret);
+
+  sc_shmem_set_type (sc_MPI_COMM_WORLD, type);  /* else last type is reused */
+  data_array = SC_SHMEM_ALLOC (int, mpisize + 1, sc_MPI_COMM_WORLD);
+  sc_shmem_prefix (&mpirank, data_array, 1, sc_MPI_INT, sc_MPI_SUM,
+                   sc_MPI_COMM_WORLD);
+
+  for (i = 0; i <= mpisize; i++) {
+    SC_CHECK_ABORTF (data_array[i] == i * (i - 1) / 2,
+                     "Error in shmem prefix. "
+                     "Array entry at %i is not correct.\n", i);
+  }
+
+  SC_SHMEM_FREE (data_array, sc_MPI_COMM_WORLD);
+  SC_GLOBAL_ESSENTIALF ("Testing type %s succesful.\n",
+                        sc_shmem_type_to_string[type]);
+}
+
+void
+test_shmem_test1 (void)
+{
+  int                 type;     /* runs from SC_SHMEM_BASIC in each loop */
+
+  SC_GLOBAL_ESSENTIAL ("Testing sc_shmem_allgather.\n");
+  sc_log_indent_push ();
+  for (type = (int) SC_SHMEM_BASIC; type < (int) SC_SHMEM_NUM_TYPES; type++) {
+    test_shmem_allgather ((sc_shmem_type_t) type);
+  }
+  sc_log_indent_pop ();
+  SC_GLOBAL_ESSENTIAL ("Testing sc_shmem_copy.\n");
+  sc_log_indent_push ();
+  for (type = (int) SC_SHMEM_BASIC; type < (int) SC_SHMEM_NUM_TYPES; type++) {
+    test_shmem_copy ((sc_shmem_type_t) type);
+  }
+  sc_log_indent_pop ();
+  SC_GLOBAL_ESSENTIAL ("Testing sc_shmem_write.\n");
+  sc_log_indent_push ();
+  for (type = (int) SC_SHMEM_BASIC; type < (int) SC_SHMEM_NUM_TYPES; type++) {
+    test_shmem_write ((sc_shmem_type_t) type);
+  }
+  sc_log_indent_pop ();
+  SC_GLOBAL_ESSENTIAL ("Testing sc_shmem_prefix.\n");
+  sc_log_indent_push ();
+  for (type = (int) SC_SHMEM_BASIC; type < (int) SC_SHMEM_NUM_TYPES; type++) {
+    test_shmem_prefix ((sc_shmem_type_t) type);
+  }
+  sc_log_indent_pop ();
+}
+
+int
+main (int argc, char *argv[])
+{
+  int                 mpiret;
+
+  mpiret = sc_MPI_Init (&argc, &argv);
+  SC_CHECK_MPI (mpiret);
+
+  sc_init (sc_MPI_COMM_WORLD, 1, 1, NULL, SC_LP_ESSENTIAL);
+
+  test_shmem_test1 ();          /* runs all tests defined in this file */
+
+  sc_finalize ();
+
+  mpiret = sc_MPI_Finalize ();
+  SC_CHECK_MPI (mpiret);
+  return 0;                     /* test failures abort before this point */
+}
diff --git a/sc/example/warp/warp.c b/sc/example/warp/warp.c
index 6fbba4b..72c1f4a 100644
--- a/sc/example/warp/warp.c
+++ b/sc/example/warp/warp.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/iniparser/Makefile.am b/sc/iniparser/Makefile.am
index 635f943..c167769 100644
--- a/sc/iniparser/Makefile.am
+++ b/sc/iniparser/Makefile.am
@@ -15,7 +15,7 @@ libsc_compiled_sources += $(iniparser_compiled_sources)
 LIBSC_CPPFLAGS += -I at top_srcdir@/iniparser
 
 EXTRA_DIST += iniparser/AUTHORS iniparser/LICENSE
-dist_scini_DATA += iniparser/twisted.ini
+dist_scini_DATA += iniparser/sc_iniparser_twisted.ini
 
 LINT_CSOURCES += \
 	$(iniparser_sc_iniexample_SOURCES)
diff --git a/sc/iniparser/twisted.ini b/sc/iniparser/sc_iniparser_twisted.ini
similarity index 100%
rename from sc/iniparser/twisted.ini
rename to sc/iniparser/sc_iniparser_twisted.ini
diff --git a/sc/src/Makefile.am b/sc/src/Makefile.am
index 19d6f2a..7a72479 100644
--- a/sc/src/Makefile.am
+++ b/sc/src/Makefile.am
@@ -7,6 +7,7 @@
 libsc_generated_headers = src/sc_config.h
 libsc_installed_headers = \
         src/sc.h src/sc_mpi.h src/sc_containers.h src/sc_avl.h \
+        src/sc_string.h src/sc_unique_counter.h src/sc_private.h \
         src/sc_options.h src/sc_functions.h src/sc_statistics.h \
         src/sc_ranges.h src/sc_io.h \
         src/sc_amr.h src/sc_search.h src/sc_sort.h \
@@ -14,18 +15,19 @@ libsc_installed_headers = \
         src/sc_bspline.h src/sc_flops.h \
         src/sc_getopt.h src/sc_obstack.h \
         src/sc_lua.h \
-	src/sc_keyvalue.h src/sc_warp.h \
+        src/sc_keyvalue.h src/sc_refcount.h src/sc_warp.h src/sc_shmem.h \
         src/sc_allgather.h src/sc_reduce.h src/sc_notify.h
 libsc_internal_headers =
 libsc_compiled_sources = \
         src/sc.c src/sc_mpi.c src/sc_containers.c src/sc_avl.c \
+        src/sc_string.c src/sc_unique_counter.c \
         src/sc_options.c src/sc_functions.c src/sc_statistics.c \
         src/sc_ranges.c src/sc_io.c \
         src/sc_amr.c src/sc_search.c src/sc_sort.c \
         src/sc_dmatrix.c src/sc_blas.c src/sc_lapack.c \
         src/sc_bspline.c src/sc_flops.c \
         src/sc_getopt.c src/sc_obstack.c src/sc_getopt1.c \
-	src/sc_keyvalue.c src/sc_warp.c \
+        src/sc_keyvalue.c src/sc_refcount.c src/sc_warp.c src/sc_shmem.c \
         src/sc_allgather.c src/sc_reduce.c src/sc_notify.c
 libsc_original_headers = \
         src/sc_builtin/getopt.h src/sc_builtin/getopt_int.h \
diff --git a/sc/src/sc.c b/sc/src/sc.c
index ee4c8a2..f8cfdb1 100644
--- a/sc/src/sc.c
+++ b/sc/src/sc.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -20,7 +21,7 @@
   02110-1301, USA.
 */
 
-#include <sc.h>
+#include <sc_private.h>
 
 #ifdef SC_HAVE_SIGNAL_H
 #include <signal.h>
@@ -38,6 +39,8 @@ typedef void        (*sc_sig_t) (int);
 #endif
 #endif
 
+#include <errno.h>
+
 #ifdef SC_ENABLE_PTHREAD
 #include <pthread.h>
 #endif
@@ -50,6 +53,8 @@ typedef struct sc_package
   int                 log_indent;
   int                 malloc_count;
   int                 free_count;
+  int                 rc_active;
+  int                 abort_mismatch;
   const char         *name;
   const char         *full;
 #ifdef SC_ENABLE_PTHREAD
@@ -91,6 +96,8 @@ int                 sc_trace_prio = SC_LP_STATISTICS;
 
 static int          default_malloc_count = 0;
 static int          default_free_count = 0;
+static int          default_rc_active = 0;
+static int          default_abort_mismatch = 1;
 
 static int          sc_identifier = -1;
 static sc_MPI_Comm  sc_mpicomm = sc_MPI_COMM_NULL;
@@ -99,6 +106,9 @@ static FILE        *sc_log_stream = NULL;
 static sc_log_handler_t sc_default_log_handler = sc_log_handler;
 static int          sc_default_log_threshold = SC_LP_THRESHOLD;
 
+static void         sc_abort_handler (void);
+static sc_abort_handler_t sc_default_abort_handler = sc_abort_handler;
+
 static int          sc_signals_caught = 0;
 static sc_sig_t     system_int_handler = NULL;
 static sc_sig_t     system_segv_handler = NULL;
@@ -142,27 +152,57 @@ sc_package_mutex (int package)
   }
 }
 
-static inline void
+#endif /* SC_ENABLE_PTHREAD */
+
+void
 sc_package_lock (int package)
 {
+#ifdef SC_ENABLE_PTHREAD
   pthread_mutex_t    *mutex = sc_package_mutex (package);
   int                 pth;
 
   pth = pthread_mutex_lock (mutex);
   sc_check_abort_thread (pth == 0, package, "sc_package_lock");
+#endif
 }
 
-static inline void
+void
 sc_package_unlock (int package)
 {
+#ifdef SC_ENABLE_PTHREAD
   pthread_mutex_t    *mutex = sc_package_mutex (package);
   int                 pth;
 
   pth = pthread_mutex_unlock (mutex);
   sc_check_abort_thread (pth == 0, package, "sc_package_unlock");
+#endif
 }
 
-#endif /* SC_ENABLE_PTHREAD */
+void
+sc_package_rc_count_add (int package_id, int toadd)
+{
+  int                *pcount;
+#ifdef SC_ENABLE_DEBUG
+  int                 newvalue;
+#endif
+
+  if (package_id == -1) {       /* -1 selects the default counter */
+    pcount = &default_rc_active;
+  }
+  else {
+    SC_ASSERT (sc_package_is_registered (package_id));
+    pcount = &sc_packages[package_id].rc_active;
+  }
+
+  sc_package_lock (package_id);
+#ifdef SC_ENABLE_DEBUG
+  newvalue =
+#endif
+    *pcount += toadd;           /* debug builds also capture the new value */
+  sc_package_unlock (package_id);
+
+  SC_ASSERT (newvalue >= 0);    /* presumably a no-op without debug -- confirm */
+}
 
 static void
 sc_signal_handler (int sig)
@@ -275,17 +315,222 @@ sc_free_count (int package)
   return &sc_packages[package].free_count;
 }
 
+#ifdef SC_ENABLE_MEMALIGN
+
+/* *INDENT-OFF* */
+static void        *
+sc_malloc_aligned (size_t alignment, size_t size)
+SC_ATTR_ALIGN (SC_MEMALIGN_BYTES);
+/* *INDENT-ON* */
+
+static void        *
+sc_malloc_aligned (size_t alignment, size_t size)
+{
+  /* minimum requirements on alignment */
+  SC_ASSERT (sizeof (char **) == sizeof (void *));
+  SC_ASSERT (sizeof (char **) >= sizeof (size_t));
+  SC_ASSERT (alignment > 0 && alignment % sizeof (void *) == 0);
+  SC_ASSERT (alignment == SC_MEMALIGN_BYTES);
+
+#if defined SC_HAVE_ANY_MEMALIGN && defined SC_HAVE_POSIX_MEMALIGN
+  {
+    void               *data = NULL;
+    int                 err = posix_memalign (&data, alignment, size);
+    SC_CHECK_ABORTF (err != ENOMEM, "Insufficient memory (malloc size %llu)",
+                     (long long unsigned) size);
+    SC_CHECK_ABORTF (err != EINVAL, "Alignment %llu is not a power of two "
+                     "or not a multiple of sizeof (void *)",
+                     (long long unsigned) alignment);
+    SC_CHECK_ABORTF (err == 0, "Return of %d from posix_memalign", err);
+    return data;
+  }
+#elif defined SC_HAVE_ANY_MEMALIGN && defined SC_HAVE_ALIGNED_ALLOC
+  {
+    void               *data = aligned_alloc (alignment, size);
+    SC_CHECK_ABORT (data != NULL || size == 0,
+                    "Returned NULL from aligned_alloc");
+    return data;
+  }
+#else
+  {
+#if 0
+    /* adapted from PetscMallocAlign */
+    int                *datastart = malloc (size + 2 * alignment);
+    int                 shift = ((uintptr_t) datastart) % alignment;
+
+    shift = (2 * alignment - shift) / sizeof (int);
+    datastart[shift - 1] = shift;
+    datastart += shift;
+    return (void *) datastart;
+#endif
+    /* We pad to achieve alignment, then write the original pointer and data
+     * size up front, then the real data shifted by at most alignment - 1
+     * bytes.  This way there is always at least one stop byte at the end that
+     * we can use for debugging. */
+    const ptrdiff_t     extrasize = (const ptrdiff_t) (2 * sizeof (char **));
+    const ptrdiff_t     signalign = (const ptrdiff_t) alignment;
+    const size_t        alloc_size = extrasize + size + alignment;
+    char               *alloc_ptr = (char *) malloc (alloc_size);
+    char               *ptr;
+    ptrdiff_t           shift, modu;
+
+    SC_CHECK_ABORT (alloc_ptr != NULL, "Returned NULL from malloc");
+
+    /* compute shift to the right where we put the actual data */
+    modu = ((ptrdiff_t) alloc_ptr + extrasize) % signalign;
+    shift = (signalign - modu) % signalign;
+    SC_ASSERT (0 <= shift && shift < signalign);
+
+    /* make sure the resulting pointer is fine */
+    ptr = alloc_ptr + (extrasize + shift);
+    SC_ASSERT ((ptrdiff_t) ptr % signalign == 0);
+
+    /* presumably an alignment hint on ptr -- confirm against SC_ARG_ALIGN */
+    SC_ARG_ALIGN (ptr, char *, SC_MEMALIGN_BYTES);
+
+    /* store the malloc pointer and the size just in front of the data */
+    ((char **) ptr)[-1] = alloc_ptr;
+    ((char **) ptr)[-2] = (char *) size;
+#ifdef SC_ENABLE_DEBUG
+    memset (alloc_ptr, -2, shift);
+    SC_ASSERT (ptr + ((ptrdiff_t) size + signalign - shift) ==
+               alloc_ptr + alloc_size);
+    memset (ptr + size, -2, signalign - shift);
+#endif
+
+    /* and we are done */
+    return (void *) ptr;
+  }
+#endif
+}
+
+static void
+sc_free_aligned (void *ptr, size_t alignment)
+{
+  /* minimum requirements on alignment */
+  SC_ASSERT (sizeof (char **) == sizeof (void *));
+  SC_ASSERT (sizeof (char **) >= sizeof (size_t));
+  SC_ASSERT (alignment > 0 && alignment % sizeof (void *) == 0);
+
+#if defined SC_HAVE_ANY_MEMALIGN && \
+   (defined SC_HAVE_POSIX_MEMALIGN || defined SC_HAVE_ALIGNED_ALLOC)
+  free (ptr);
+#else
+  {
+#if 0
+    int                *datastart = ptr;
+    int                 shift = datastart[-1];
+
+    datastart -= shift;
+    free ((void *) datastart);
+#endif
+    /* this mirrors the function sc_malloc_aligned above */
+    char               *alloc_ptr;
+#ifdef SC_ENABLE_DEBUG
+    const ptrdiff_t     extrasize = (const ptrdiff_t) (2 * sizeof (char **));
+    const ptrdiff_t     signalign = (const ptrdiff_t) alignment;
+    ptrdiff_t           shift, modu, ssize, i;
+#endif
+
+    /* we excluded these cases earlier; signalign exists in debug mode only */
+    SC_ASSERT (ptr != NULL);
+    SC_ASSERT ((ptrdiff_t) ptr % signalign == 0);
+
+    alloc_ptr = ((char **) ptr)[-1];
+    SC_ASSERT (alloc_ptr != NULL);
+
+#ifdef SC_ENABLE_DEBUG
+    /* recompute the shift; compare fill bytes as unsigned char like memset */
+    ssize = (ptrdiff_t) ((char **) ptr)[-2];
+    modu = ((ptrdiff_t) alloc_ptr + extrasize) % signalign;
+    shift = (signalign - modu) % signalign;
+    SC_ASSERT (0 <= shift && shift < signalign);
+    SC_ASSERT ((char *) ptr == alloc_ptr + (extrasize + shift));
+    for (i = 0; i < shift; ++i) {
+      SC_ASSERT ((unsigned char) alloc_ptr[i] == (unsigned char) -2);
+    }
+    for (i = 0; i < signalign - shift; ++i) {
+      SC_ASSERT (((unsigned char *) ptr)[ssize + i] == (unsigned char) -2);
+    }
+#endif
+
+    /* free the original pointer */
+    free (alloc_ptr);
+  }
+#endif
+}
+
+/* *INDENT-OFF* */
+static void        *
+sc_realloc_aligned (void *ptr, size_t alignment, size_t size)
+SC_ATTR_ALIGN (SC_MEMALIGN_BYTES);
+/* *INDENT-ON* */
+
+static void        *
+sc_realloc_aligned (void *ptr, size_t alignment, size_t size)
+{
+  /* minimum requirements on alignment */
+  SC_ASSERT (sizeof (char **) == sizeof (void *));
+  SC_ASSERT (sizeof (char **) >= sizeof (size_t));
+  SC_ASSERT (alignment > 0 && alignment % sizeof (void *) == 0);
+  SC_ASSERT (alignment == SC_MEMALIGN_BYTES);
+
+#if defined SC_HAVE_ANY_MEMALIGN && \
+   (defined SC_HAVE_POSIX_MEMALIGN || defined SC_HAVE_ALIGNED_ALLOC)
+  /* NOTE(review): the result may be unaligned and is returned unchecked */
+  return realloc (ptr, size);
+#else
+  {
+#ifdef SC_ENABLE_DEBUG
+    const ptrdiff_t     signalign = (const ptrdiff_t) alignment;
+#endif
+    size_t              old_size, min_size;
+    void               *new_ptr;
+
+    /* we excluded these cases earlier; signalign exists in debug mode only */
+    SC_ASSERT (ptr != NULL && size > 0);
+    SC_ASSERT ((ptrdiff_t) ptr % signalign == 0);
+
+    /* back out the size that sc_malloc_aligned stored in front of the data */
+    old_size = (size_t) ((char **) ptr)[-2];
+
+    /* create new memory while the old memory is still around */
+    new_ptr = sc_malloc_aligned (alignment, size);
+
+    /* copy data; debug builds fill any newly grown tail with the byte -3 */
+    min_size = SC_MIN (old_size, size);
+    memcpy (new_ptr, ptr, min_size);
+#ifdef SC_ENABLE_DEBUG
+    memset ((char *) new_ptr + min_size, -3, size - min_size);
+#endif
+
+    /* free old memory and return new pointer */
+    sc_free_aligned (ptr, alignment);
+    return new_ptr;
+  }
+#endif
+}
+
+#endif /* SC_ENABLE_MEMALIGN */
+
 void               *
 sc_malloc (int package, size_t size)
 {
   void               *ret;
   int                *malloc_count = sc_malloc_count (package);
 
+  /* allocate memory */
+#if defined SC_ENABLE_MEMALIGN
+  ret = sc_malloc_aligned (SC_MEMALIGN_BYTES, size);
+#else
   ret = malloc (size);
   if (size > 0) {
-    SC_CHECK_ABORT (ret != NULL, "Allocation");
+    SC_CHECK_ABORTF (ret != NULL, "Allocation (malloc size %lli)",
+                     (long long int) size);
   }
+#endif
 
+  /* count the allocations */
 #ifdef SC_ENABLE_PTHREAD
   sc_package_lock (package);
 #endif
@@ -308,11 +553,19 @@ sc_calloc (int package, size_t nmemb, size_t size)
   void               *ret;
   int                *malloc_count = sc_malloc_count (package);
 
+  /* allocate memory */
+#if defined SC_ENABLE_MEMALIGN
+  ret = sc_malloc_aligned (SC_MEMALIGN_BYTES, nmemb * size);
+  memset (ret, 0, nmemb * size);
+#else
   ret = calloc (nmemb, size);
   if (nmemb * size > 0) {
-    SC_CHECK_ABORT (ret != NULL, "Allocation");
+    SC_CHECK_ABORTF (ret != NULL, "Allocation (calloc size %lli)",
+                     (long long int) size);
   }
+#endif
 
+  /* count the allocations */
 #ifdef SC_ENABLE_PTHREAD
   sc_package_lock (package);
 #endif
@@ -342,8 +595,13 @@ sc_realloc (int package, void *ptr, size_t size)
   else {
     void               *ret;
 
+#if defined SC_ENABLE_MEMALIGN
+    ret = sc_realloc_aligned (ptr, SC_MEMALIGN_BYTES, size);
+#else
     ret = realloc (ptr, size);
-    SC_CHECK_ABORT (ret != NULL, "Reallocation");
+    SC_CHECK_ABORTF (ret != NULL, "Reallocation (realloc size %lli)",
+                     (long long int) size);
+#endif
 
     return ret;
   }
@@ -369,7 +627,11 @@ sc_strdup (int package, const char *s)
 void
 sc_free (int package, void *ptr)
 {
-  if (ptr != NULL) {
+  if (ptr == NULL) {
+    return;
+  }
+  else {
+    /* uncount the allocations */
     int                *free_count = sc_free_count (package);
 
 #ifdef SC_ENABLE_PTHREAD
@@ -380,7 +642,13 @@ sc_free (int package, void *ptr)
     sc_package_unlock (package);
 #endif
   }
+
+  /* free memory */
+#if defined SC_ENABLE_MEMALIGN
+  sc_free_aligned (ptr, SC_MEMALIGN_BYTES);
+#else
   free (ptr);
+#endif
 }
 
 int
@@ -399,18 +667,46 @@ sc_memory_status (int package)
 }
 
 void
+sc_package_set_abort_alloc_mismatch (int package_id, int set_abort)
+{
+  if (package_id == -1) {       /* -1 selects the default setting */
+    default_abort_mismatch = set_abort;
+  }
+  else {
+    sc_package_t       *p;
+
+    SC_ASSERT (sc_package_is_registered (package_id));
+    p = sc_packages + package_id;
+    p->abort_mismatch = set_abort;      /* read by sc_memory_check */
+  }
+}
+
+void
 sc_memory_check (int package)
 {
   sc_package_t       *p;
 
-  if (package == -1)
-    SC_CHECK_ABORT (default_malloc_count == default_free_count,
-                    "Memory balance (default)");
+  if (package == -1) {
+    SC_CHECK_ABORT (default_rc_active == 0, "Leftover references (default)");
+    if (default_abort_mismatch) {
+      SC_CHECK_ABORT (default_malloc_count == default_free_count,
+                      "Memory balance (default)");
+    }
+    else if (default_malloc_count != default_free_count) {
+      SC_GLOBAL_LERROR ("Memory balance (default)\n");
+    }
+  }
   else {
     SC_ASSERT (sc_package_is_registered (package));
     p = sc_packages + package;
-    SC_CHECK_ABORTF (p->malloc_count == p->free_count,
-                     "Memory balance (%s)", p->name);
+    SC_CHECK_ABORTF (p->rc_active == 0, "Leftover references (%s)", p->name);
+    if (p->abort_mismatch) {
+      SC_CHECK_ABORTF (p->malloc_count == p->free_count,
+                       "Memory balance (%s)", p->name);
+    }
+    else if (p->malloc_count != p->free_count) {
+      SC_GLOBAL_LERRORF ("Memory balance (%s)\n", p->name);
+    }
   }
 }
 
@@ -478,8 +774,8 @@ sc_set_log_defaults (FILE * log_stream,
     sc_default_log_threshold = SC_LP_THRESHOLD;
   }
   else {
-    SC_ASSERT (log_threshold >= SC_LP_ALWAYS
-               && log_threshold <= SC_LP_SILENT);
+    SC_ASSERT (log_threshold >= SC_LP_ALWAYS &&
+               log_threshold <= SC_LP_SILENT);
     sc_default_log_threshold = log_threshold;
   }
 
@@ -601,8 +897,22 @@ sc_log_indent_pop_count (int package, int count)
 }
 
 void
+sc_set_abort_handler (sc_abort_handler_t abort_handler)
+{
+  sc_default_abort_handler = abort_handler != NULL ? abort_handler :
+    sc_abort_handler;
+}
+
+void
 sc_abort (void)
 {
+  sc_default_abort_handler ();
+  abort ();                     /* if the user supplied callback incorrectly returns, abort */
+}
+
+static void
+sc_abort_handler (void)
+{
   if (0) {
   }
 #ifdef SC_BACKTRACE
@@ -705,8 +1015,8 @@ sc_package_register (sc_log_handler_t log_handler, int log_threshold,
   int                 new_package_id = -1;
 
   SC_CHECK_ABORT (log_threshold == SC_LP_DEFAULT ||
-                  (log_threshold >= SC_LP_ALWAYS
-                   && log_threshold <= SC_LP_SILENT),
+                  (log_threshold >= SC_LP_ALWAYS &&
+                   log_threshold <= SC_LP_SILENT),
                   "Invalid package log threshold");
   SC_CHECK_ABORT (strcmp (name, "default"), "Package default forbidden");
   SC_CHECK_ABORT (strchr (name, ' ') == NULL,
@@ -748,6 +1058,7 @@ sc_package_register (sc_log_handler_t log_handler, int log_threshold,
       p->log_indent = 0;
       p->malloc_count = 0;
       p->free_count = 0;
+      p->rc_active = 0;
       p->name = NULL;
       p->full = NULL;
     }
@@ -759,6 +1070,8 @@ sc_package_register (sc_log_handler_t log_handler, int log_threshold,
   new_package->log_indent = 0;
   new_package->malloc_count = 0;
   new_package->free_count = 0;
+  new_package->rc_active = 0;
+  new_package->abort_mismatch = 1;
   new_package->name = name;
   new_package->full = full;
 #ifdef SC_ENABLE_PTHREAD
@@ -783,6 +1096,22 @@ sc_package_is_registered (int package_id)
 }
 
 void
+sc_package_set_verbosity (int package_id, int log_priority)
+{
+  sc_package_t       *p;
+
+  SC_CHECK_ABORT (sc_package_is_registered (package_id),
+                  "Package id is not registered");
+  SC_CHECK_ABORT (log_priority == SC_LP_DEFAULT ||
+                  (log_priority >= SC_LP_ALWAYS &&
+                   log_priority <= SC_LP_SILENT),
+                  "Invalid package log threshold");
+
+  p = sc_packages + package_id;
+  p->log_threshold = log_priority;
+}
+
+void
 sc_package_unregister (int package_id)
 {
 #ifdef SC_ENABLE_PTHREAD
@@ -799,6 +1128,7 @@ sc_package_unregister (int package_id)
   p->log_handler = NULL;
   p->log_threshold = SC_LP_DEFAULT;
   p->malloc_count = p->free_count = 0;
+  p->rc_active = 0;
 #ifdef SC_ENABLE_PTHREAD
   i = pthread_mutex_destroy (&p->mutex);
   SC_CHECK_ABORTF (i == 0, "Mutex destroy failed for package %s", p->name);
@@ -919,6 +1249,29 @@ sc_init (sc_MPI_Comm mpicomm,
   SC_GLOBAL_PRODUCTIONF ("%-*s %s\n", w, "LAPACK_LIBS", SC_LAPACK_LIBS);
   SC_GLOBAL_PRODUCTIONF ("%-*s %s\n", w, "FLIBS", SC_FLIBS);
 #endif
+
+#if defined(SC_ENABLE_MPI) && defined(SC_ENABLE_MPICOMMSHARED)
+  if (mpicomm != MPI_COMM_NULL) {
+    int                 mpiret;
+    MPI_Comm            intranode, internode;
+
+    /* compute the node comms by default */
+    sc_mpi_comm_attach_node_comms (mpicomm, 0);
+    sc_mpi_comm_get_node_comms (mpicomm, &intranode, &internode);
+    if (intranode == MPI_COMM_NULL) {
+      SC_GLOBAL_STATISTICS ("No shared memory node communicators\n");
+    }
+    else {
+      int                 intrasize;
+
+      mpiret = MPI_Comm_size (intranode, &intrasize);
+      SC_CHECK_MPI (mpiret);
+
+      SC_GLOBAL_STATISTICSF ("Shared memory node communicator size: %d\n",
+                             intrasize);
+    }
+  }
+#endif
 }
 
 void
@@ -927,6 +1280,10 @@ sc_finalize (void)
   int                 i;
   int                 retval;
 
+#if defined(SC_ENABLE_MPI) && defined(SC_ENABLE_MPICOMMSHARED)
+  sc_mpi_comm_detach_node_comms (sc_mpicomm);
+#endif
+
   /* sc_packages is static and thus initialized to all zeros */
   for (i = sc_num_packages_alloc - 1; i >= 0; --i)
     if (sc_packages[i].is_registered)
diff --git a/sc/src/sc.h b/sc/src/sc.h
index b87673d..7ab8650 100644
--- a/sc/src/sc.h
+++ b/sc/src/sc.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -43,6 +44,34 @@
 #define _sc_restrict restrict
 #endif
 
+/* test for gcc version without features.h */
+#define SC_CALC_VERSION(major,minor,patchlevel) \
+                       (((major) * 1000 + (minor)) * 1000 + (patchlevel))
+#ifdef __GNUC__
+#define SC_GCC_VERSION \
+        SC_CALC_VERSION(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
+#else
+#define SC_GCC_VERSION \
+        SC_CALC_VERSION (0, 0, 0)
+#endif
+
+/* use this feature macro, be minimally invasive */
+#ifdef SC_ENABLE_MEMALIGN
+/* we disable the system-provided functions for the time being */
+#ifdef SC_HAVE_ANY_MEMALIGN
+#undef SC_HAVE_ANY_MEMALIGN
+#endif
+/* if system-provided functions are needed, give them the prototype */
+#ifdef SC_HAVE_ANY_MEMALIGN
+#ifndef SC_HAVE_POSIX_MEMALIGN
+#ifdef SC_HAVE_ALIGNED_ALLOC
+#define _ISOC11_SOURCE
+#endif
+#endif
+#endif
+/* done with memalign macros */
+#endif
+
 /* use this in case mpi.h includes stdint.h */
 
 #ifndef __STDC_LIMIT_MACROS
@@ -76,13 +105,16 @@
 
 /* provide extern C defines */
 
-/* the hacks below enable semicolons after the SC_EXTERN_C_ macros */
+/* The hacks below enable semicolons after the SC_EXTERN_C_ macros
+ * and also take care of the different semantics of () / (...) */
 #ifdef __cplusplus
 #define SC_EXTERN_C_BEGIN       extern "C" { void sc_extern_c_hack_1 (void)
 #define SC_EXTERN_C_END                    } void sc_extern_c_hack_2 (void)
+#define SC_NOARGS               ...
 #else
 #define SC_EXTERN_C_BEGIN                    void sc_extern_c_hack_3 (void)
 #define SC_EXTERN_C_END                      void sc_extern_c_hack_4 (void)
+#define SC_NOARGS
 #endif
 
 /* this libsc header is always included */
@@ -235,6 +267,37 @@ void                SC_CHECK_ABORTF (int success, const char *fmt, ...)
 #define SC_STRDUP(s)                sc_strdup (sc_package_id, (s))
 #define SC_FREE(p)                  sc_free (sc_package_id, (p))
 
+/* macros for memory alignment */
+/* some copied from bfam: https://github.com/bfam/bfam */
+
+#define SC_ALIGN_UP(x,n) ( ((n) <= 0) ? (x) : ((x) + (n) - 1) / (n) * (n) )
+
+#if defined (__bgq__)
+#define SC_ARG_ALIGN(p,t,n) __alignx((n), (p))
+#elif defined (__ICC)
+#define SC_ARG_ALIGN(p,t,n) __assume_aligned((p), (n))
+#elif defined (__clang__)
+#define SC_ARG_ALIGN(p,t,n) SC_NOOP ()
+#elif defined (__GNUC__) || defined (__GNUG__)
+
+#if SC_GCC_VERSION >= SC_CALC_VERSION (4, 7, 0)
+#define SC_ARG_ALIGN(p,t,n) do {                              \
+  (p) = (t) __builtin_assume_aligned((void *) (p), (n));      \
+} while (0)
+#else
+#define SC_ARG_ALIGN(p,t,n) SC_NOOP ()
+#endif
+
+#else
+#define SC_ARG_ALIGN(p,t,n) SC_NOOP ()
+#endif
+
+#if (defined __GNUC__) || (defined __PGI) || (defined __IBMC__)
+#define SC_ATTR_ALIGN(n) __attribute__((aligned(n)))
+#else
+#define SC_ATTR_ALIGN(n)
+#endif
+
 /**
  * Sets n elements of a memory range to zero.
  * Assumes the pointer p is of the correct type.
@@ -425,6 +488,7 @@ typedef void        (*sc_log_handler_t) (FILE * log_stream,
                                          const char *filename, int lineno,
                                          int package, int category,
                                          int priority, const char *msg);
+typedef void        (*sc_abort_handler_t) (void);
 
 /* memory allocation functions, will abort if out of memory */
 
@@ -455,6 +519,12 @@ void                sc_set_log_defaults (FILE * log_stream,
                                          sc_log_handler_t log_handler,
                                          int log_thresold);
 
+/** Controls the default SC abort behavior.
+ * \param [in] abort_handler Set default SC abort handler (NULL selects
+ *                           builtin).  ***This function should not return!***
+ */
+void                sc_set_abort_handler (sc_abort_handler_t abort_handler);
+
 /** The central log function to be called by all packages.
  * Dispatches the log calls by package and filters by category and priority.
  * \param [in] package   Must be a registered package id or -1.
@@ -516,15 +586,62 @@ void                sc_abort_collective (const char *msg)
 int                 sc_package_register (sc_log_handler_t log_handler,
                                          int log_threshold,
                                          const char *name, const char *full);
+
+/** Query whether an identifier matches a registered package.
+ * \param [in] package_id       Only a non-negative id can be registered.
+ * \return                      True if and only if the package id is
+ *                              non-negative and package is registered.
+ */
 int                 sc_package_is_registered (int package_id);
 
+/** Acquire a pthread mutex lock.
+ * If configured without --enable-pthread, this function does nothing.
+ * This function must be followed with a matching \ref sc_package_unlock.
+ * \param [in] package_id       Either -1 for an undefined package or
+ *                              an id returned from \ref sc_package_register.
+ *                              Depending on the value, the appropriate mutex
+ *                              is chosen.  Thus, we may overlap locking calls
+ *                              with distinct package_id.
+ */
+void                sc_package_lock (int package_id);
+
+/** Release a pthread mutex lock.
+ * If configured without --enable-pthread, this function does nothing.
+ * This function must follow a matching \ref sc_package_lock.
+ * \param [in] package_id       Either -1 for an undefined package or
+ *                              an id returned from \ref sc_package_register.
+ *                              Depending on the value, the appropriate mutex
+ *                              is chosen.  Thus, we may overlap locking calls
+ *                              with distinct package_id.
+ */
+void                sc_package_unlock (int package_id);
+
+/** Set the logging verbosity of a registered package.
+ * This can be called at any point in the program, any number of times.
+ * It can only lower the verbosity at and below the value of SC_LP_THRESHOLD.
+ * \param [in] package_id       Must be a registered package identifier.
+ */
+void                sc_package_set_verbosity (int package_id,
+                                              int log_priority);
+
+/** Set the unregister behavior of sc_package_unregister().
+ *
+ * \param[in] package_id    Must be -1 for the default package or
+ *                          the identifier of a registered package.
+ * \param[in] set_abort     True if sc_package_unregister() should abort if the
+ *                          number of allocs does not match the number of
+ *                          frees; false otherwise.
+ */
+void                sc_package_set_abort_alloc_mismatch (int package_id,
+                                                         int set_abort);
+
 /** Unregister a software package with SC.
  * This function must only be called after additional threads are finished.
  */
 void                sc_package_unregister (int package_id);
 
 /** Print a summary of all packages registered with SC.
- * Uses the SC_LP_GLOBAL log category which by default only prints on rank 0.
+ * Uses the SC_LC_GLOBAL log category which by default only prints on rank 0.
  * \param [in] log_priority     Priority passed to sc log functions.
  */
 void                sc_package_print_summary (int log_priority);
diff --git a/sc/src/sc_allgather.c b/sc/src/sc_allgather.c
index bbeaf9c..ffca2b1 100644
--- a/sc/src/sc_allgather.c
+++ b/sc/src/sc_allgather.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_allgather.h b/sc/src/sc_allgather.h
index bd1d5a1..d3caf7b 100644
--- a/sc/src/sc_allgather.h
+++ b/sc/src/sc_allgather.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_amr.c b/sc/src/sc_amr.c
index a0346f3..d14c0be 100644
--- a/sc/src/sc_amr.c
+++ b/sc/src/sc_amr.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_amr.h b/sc/src/sc_amr.h
index 96d3bd2..83d26b1 100644
--- a/sc/src/sc_amr.h
+++ b/sc/src/sc_amr.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_blas.c b/sc/src/sc_blas.c
index d42820a..c9cc27e 100644
--- a/sc/src/sc_blas.c
+++ b/sc/src/sc_blas.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -31,7 +32,7 @@ const char          sc_cmachchar[] =
 #ifndef SC_WITH_BLAS
 
 int
-sc_blas_nonimplemented ()
+sc_blas_nonimplemented (SC_NOARGS)
 {
   SC_ABORT ("BLAS not compiled in this configuration");
   return 0;
diff --git a/sc/src/sc_blas.h b/sc/src/sc_blas.h
index 1bb3aa9..9ceca9c 100644
--- a/sc/src/sc_blas.h
+++ b/sc/src/sc_blas.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -24,6 +25,11 @@
 #define SC_BLAS_H
 
 #include <sc.h>
+#if defined(__bgq__)
+/* TODO - FOR NOW WE DO NOT USE ESSL
+  # include <essl.h>
+*/
+#endif
 
 SC_EXTERN_C_BEGIN;
 
@@ -76,8 +82,16 @@ extern const char   sc_cmachchar[];
 #ifdef SC_WITH_BLAS
 
 #ifndef SC_F77_FUNC
+#if defined(__bgq__)            /* && defined(__HAVE_ESSL) */
+#define SC_F77_FUNC(small,CAPS) small
+/* TODO - For now we do not use ESSL
+  #   define SC_F77_FUNC(small,CAPS) small ## _
+*/
+#define SC_F77_FUNC_NOESSL(small,CAPS) small
+#else
 #define SC_F77_FUNC(small,CAPS) small ## _
 #endif
+#endif /* SC_F77_FUNC */
 
 #define SC_BLAS_DLAMCH  SC_F77_FUNC(dlamch,DLAMCH)
 #define SC_BLAS_DSCAL   SC_F77_FUNC(dscal,DSCAL)
@@ -125,7 +139,7 @@ void                SC_BLAS_DGEMM (const char *transa, const char *transb,
 #define SC_BLAS_DGEMM  (void)   sc_blas_nonimplemented
 #define SC_BLAS_DGEMV  (void)   sc_blas_nonimplemented
 
-int                 sc_blas_nonimplemented ();
+int                 sc_blas_nonimplemented (SC_NOARGS);
 
 #endif
 
diff --git a/sc/src/sc_bspline.c b/sc/src/sc_bspline.c
index c4dea6c..0225f7c 100644
--- a/sc/src/sc_bspline.c
+++ b/sc/src/sc_bspline.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_bspline.h b/sc/src/sc_bspline.h
index eb8a976..d97a16f 100644
--- a/sc/src/sc_bspline.h
+++ b/sc/src/sc_bspline.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_containers.c b/sc/src/sc_containers.c
index 89de654..f462f8c 100644
--- a/sc/src/sc_containers.c
+++ b/sc/src/sc_containers.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -166,9 +167,14 @@ void
 sc_array_resize (sc_array_t * array, size_t new_count)
 {
   size_t              newoffs, roundup, newsize;
+#if !defined SC_ENABLE_USE_REALLOC || defined SC_DEBUG
+  size_t              oldoffs, minoffs;
+#endif
+#ifndef SC_ENABLE_USE_REALLOC
+  char               *ptr;
+#endif
 #ifdef SC_DEBUG
-  size_t              oldoffs;
-  size_t              i, minoffs;
+  size_t              i;
 #endif
 
   if (!SC_ARRAY_IS_OWNER (array)) {
@@ -186,16 +192,19 @@ sc_array_resize (sc_array_t * array, size_t new_count)
     return;
   }
 
-#ifdef SC_DEBUG
+  /* Figure out how the array size will change */
+  newoffs = new_count * array->elem_size;
+#if defined SC_DEBUG || !defined SC_ENABLE_USE_REALLOC
   oldoffs = array->elem_count * array->elem_size;
+  minoffs = SC_MIN (oldoffs, newoffs);
 #endif
   array->elem_count = new_count;
-  newoffs = array->elem_count * array->elem_size;
   roundup = (size_t) SC_ROUNDUP2_64 (newoffs);
   SC_ASSERT (roundup >= newoffs && roundup <= 2 * newoffs);
 
   if (newoffs > (size_t) array->byte_alloc ||
       roundup < (size_t) array->byte_alloc) {
+    /* we will reallocate the array memory, either grow or shrink it */
     array->byte_alloc = (ssize_t) roundup;
   }
   else {
@@ -207,15 +216,24 @@ sc_array_resize (sc_array_t * array, size_t new_count)
       SC_ASSERT (array->array[i] == (char) -1);
     }
 #endif
+    /* we keep the current allocation */
     return;
   }
+
+  /* byte_alloc is the size to be realloced to, it may be smaller than oldoffs */
   SC_ASSERT ((size_t) array->byte_alloc >= newoffs);
 
   newsize = (size_t) array->byte_alloc;
+#ifdef SC_ENABLE_USE_REALLOC
   array->array = SC_REALLOC (array->array, char, newsize);
+#else
+  ptr = SC_ALLOC (char, newsize);
+  memcpy (ptr, array->array, minoffs);
+  SC_FREE (array->array);
+  array->array = ptr;
+#endif
 
 #ifdef SC_DEBUG
-  minoffs = SC_MIN (oldoffs, newoffs);
   SC_ASSERT (minoffs <= newsize);
   memset (array->array + minoffs, -1, newsize - minoffs);
 #endif
@@ -684,8 +702,28 @@ sc_containers_free (void *p)
 
 static void         (*obstack_chunk_free) (void *) = sc_containers_free;
 
-sc_mempool_t       *
-sc_mempool_new (size_t elem_size)
+/** This function is static; we do not like to expose _ext functions in libsc. */
+static void
+sc_mempool_init_ext (sc_mempool_t * mempool, size_t elem_size,
+                     int zero_and_persist)
+{
+  mempool->elem_size = elem_size;
+  mempool->elem_count = 0;
+  mempool->zero_and_persist = zero_and_persist;
+
+  obstack_init (&mempool->obstack);
+  sc_array_init (&mempool->freed, sizeof (void *));
+}
+
+void
+sc_mempool_init (sc_mempool_t * mempool, size_t elem_size)
+{
+  sc_mempool_init_ext (mempool, elem_size, 0);
+}
+
+/** This function is static; we do not like to expose _ext functions in libsc. */
+static sc_mempool_t *
+sc_mempool_new_ext (size_t elem_size, int zero_and_persist)
 {
   sc_mempool_t       *mempool;
 
@@ -694,21 +732,34 @@ sc_mempool_new (size_t elem_size)
 
   mempool = SC_ALLOC (sc_mempool_t, 1);
 
-  mempool->elem_size = elem_size;
-  mempool->elem_count = 0;
-
-  obstack_init (&mempool->obstack);
-  sc_array_init (&mempool->freed, sizeof (void *));
+  sc_mempool_init_ext (mempool, elem_size, zero_and_persist);
 
   return mempool;
 }
 
+sc_mempool_t       *
+sc_mempool_new (size_t elem_size)
+{
+  return sc_mempool_new_ext (elem_size, 0);
+}
+
+sc_mempool_t       *
+sc_mempool_new_zero_and_persist (size_t elem_size)
+{
+  return sc_mempool_new_ext (elem_size, 1);
+}
+
 void
-sc_mempool_destroy (sc_mempool_t * mempool)
+sc_mempool_reset (sc_mempool_t * mempool)
 {
   sc_array_reset (&mempool->freed);
   obstack_free (&mempool->obstack, NULL);
+}
 
+void
+sc_mempool_destroy (sc_mempool_t * mempool)
+{
+  sc_mempool_reset (mempool);
   SC_FREE (mempool);
 }
 
@@ -807,7 +858,7 @@ sc_list_unlink (sc_list_t * list)
   list->elem_count = 0;
 }
 
-void
+sc_link_t          *
 sc_list_prepend (sc_list_t * list, void *data)
 {
   sc_link_t          *lynk;
@@ -821,9 +872,10 @@ sc_list_prepend (sc_list_t * list, void *data)
   }
 
   ++list->elem_count;
+  return lynk;
 }
 
-void
+sc_link_t          *
 sc_list_append (sc_list_t * list, void *data)
 {
   sc_link_t          *lynk;
@@ -840,9 +892,10 @@ sc_list_append (sc_list_t * list, void *data)
   list->last = lynk;
 
   ++list->elem_count;
+  return lynk;
 }
 
-void
+sc_link_t          *
 sc_list_insert (sc_list_t * list, sc_link_t * pred, void *data)
 {
   sc_link_t          *lynk;
@@ -858,6 +911,7 @@ sc_list_insert (sc_list_t * list, sc_link_t * pred, void *data)
   }
 
   ++list->elem_count;
+  return lynk;
 }
 
 void               *
@@ -870,6 +924,7 @@ sc_list_remove (sc_list_t * list, sc_link_t * pred)
     return sc_list_pop (list);
   }
 
+  SC_ASSERT (list->first != NULL && list->last != NULL);
   SC_ASSERT (pred->next != NULL);
 
   lynk = pred->next;
@@ -890,7 +945,7 @@ sc_list_pop (sc_list_t * list)
   sc_link_t          *lynk;
   void               *data;
 
-  SC_ASSERT (list->first != NULL);
+  SC_ASSERT (list->first != NULL && list->last != NULL);
 
   lynk = list->first;
   list->first = lynk->next;
@@ -1001,7 +1056,7 @@ sc_hash_maybe_resize (sc_hash_t * hash)
       /* insert data into new slot list */
       j = hash->hash_fn (lynk->data, hash->user_data) % new_size;
       new_list = (sc_list_t *) sc_array_index (new_slots, j);
-      sc_list_prepend (new_list, lynk->data);
+      (void) sc_list_prepend (new_list, lynk->data);
       ++new_count;
 
       /* remove old list element */
@@ -1174,7 +1229,7 @@ sc_hash_insert_unique (sc_hash_t * hash, void *v, void ***found)
   }
 
   /* append new object to the list */
-  sc_list_append (list, v);
+  (void) sc_list_append (list, v);
   if (found != NULL) {
     *found = &list->last->data;
   }
diff --git a/sc/src/sc_containers.h b/sc/src/sc_containers.h
index 30eab7c..c07ed54 100644
--- a/sc/src/sc_containers.h
+++ b/sc/src/sc_containers.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -173,22 +174,26 @@ void                sc_array_init_size (sc_array_t * array,
                                         size_t elem_size, size_t elem_count);
 
 /** Initializes an already allocated (or static) view from existing sc_array_t.
+ * The array view returned does not require sc_array_reset (doesn't hurt though).
  * \param [in,out] view  Array structure to be initialized.
  * \param [in] array     The array must not be resized while view is alive.
  * \param [in] offset    The offset of the viewed section in element units.
  *                       This offset cannot be changed until the view is reset.
  * \param [in] length    The length of the view in element units.
  *                       The view cannot be resized to exceed this length.
+ *                       It is not necessary to call sc_array_reset later.
  */
 void                sc_array_init_view (sc_array_t * view, sc_array_t * array,
                                         size_t offset, size_t length);
 
 /** Initializes an already allocated (or static) view from given plain C data.
+ * The array view returned does not require sc_array_reset (doesn't hurt though).
  * \param [in,out] view     Array structure to be initialized.
  * \param [in] base         The data must not be moved while view is alive.
  * \param [in] elem_size    Size of one array element in bytes.
  * \param [in] elem_count   The length of the view in element units.
  *                          The view cannot be resized to exceed this length.
+ *                          It is not necessary to call sc_array_reset later.
  */
 void                sc_array_init_data (sc_array_t * view, void *base,
                                         size_t elem_size, size_t elem_count);
@@ -198,6 +203,9 @@ void                sc_array_init_data (sc_array_t * view, void *base,
  * \param [in,out]  array       Array structure to be reset.
  * \note Calling sc_array_init, then any array operations,
  *       then sc_array_reset is memory neutral.
+ *       As an exception, the two functions sc_array_init_view and
+ *       sc_array_init_data do not require a subsequent call to sc_array_reset.
+ *       Regardless, it is legal to call sc_array_reset anyway.
  */
 void                sc_array_reset (sc_array_t * array);
 
@@ -427,15 +435,16 @@ sc_array_index_int16 (sc_array_t * array, int16_t i16)
 static inline       size_t
 sc_array_position (sc_array_t * array, void *element)
 {
-  size_t              position;
+  ptrdiff_t           position;
 
   SC_ASSERT (array->array <= (char *) element);
-  SC_ASSERT (((char *) element - array->array) % array->elem_size == 0);
+  SC_ASSERT (((char *) element - array->array) %
+             (ptrdiff_t) array->elem_size == 0);
 
-  position = ((char *) element - array->array) / array->elem_size;
-  SC_ASSERT (position < array->elem_count);
+  position = ((char *) element - array->array) / (ptrdiff_t) array->elem_size;
+  SC_ASSERT (0 <= position && position < (ptrdiff_t) array->elem_count);
 
-  return position;
+  return (size_t) position;
 }
 
 /** Remove the last element from an array and return a pointer to it.
@@ -492,12 +501,16 @@ sc_array_push (sc_array_t * array)
  * Elements are referenced by their address which never changes.
  * Elements can be freed (that is, returned to the pool)
  *    and are transparently reused.
+ * If the zero_and_persist option is selected, new elements are initialized to
+ * all zeros on creation, and the contents of an element are not touched
+ * between freeing and re-returning it.
  */
 typedef struct sc_mempool
 {
   /* interface variables */
   size_t              elem_size;        /**< size of a single element */
   size_t              elem_count;       /**< number of valid elements */
+  int                 zero_and_persist; /**< Boolean; is set in constructor. */
 
   /* implementation variables */
   struct obstack      obstack;  /**< holds the allocated elements */
@@ -511,17 +524,33 @@ sc_mempool_t;
  */
 size_t              sc_mempool_memory_used (sc_mempool_t * mempool);
 
-/** Creates a new mempool structure.
+/** Creates a new mempool structure with the zero_and_persist option off.
+ * The contents of any elements returned by sc_mempool_alloc are undefined.
  * \param [in] elem_size  Size of one element in bytes.
  * \return Returns an allocated and initialized memory pool.
  */
 sc_mempool_t       *sc_mempool_new (size_t elem_size);
 
+/** Creates a new mempool structure with the zero_and_persist option on.
+ * The memory of newly created elements is zero'd out, and the contents of an
+ * element are not touched between freeing and re-returning it.
+ * \param [in] elem_size  Size of one element in bytes.
+ * \return Returns an allocated and initialized memory pool.
+ */
+sc_mempool_t       *sc_mempool_new_zero_and_persist (size_t elem_size);
+
+/** Same as sc_mempool_new, but for an already allocated sc_mempool_t pointer. */
+void                sc_mempool_init (sc_mempool_t * mempool,
+                                     size_t elem_size);
+
 /** Destroys a mempool structure.
  * All elements that are still in use are invalidated.
  */
 void                sc_mempool_destroy (sc_mempool_t * mempool);
 
+/** Same as sc_mempool_destroy, but does not free the pointer */
+void                sc_mempool_reset (sc_mempool_t * mempool);
+
 /** Invalidates all previously returned pointers, resets count to 0.
  */
 void                sc_mempool_truncate (sc_mempool_t * mempool);
@@ -544,10 +573,15 @@ sc_mempool_alloc (sc_mempool_t * mempool)
   }
   else {
     ret = obstack_alloc (&mempool->obstack, (int) mempool->elem_size);
+    if (mempool->zero_and_persist) {
+      memset (ret, 0, mempool->elem_size);
+    }
   }
 
 #ifdef SC_DEBUG
-  memset (ret, -1, mempool->elem_size);
+  if (!mempool->zero_and_persist) {
+    memset (ret, -1, mempool->elem_size);
+  }
 #endif
 
   return ret;
@@ -565,7 +599,9 @@ sc_mempool_free (sc_mempool_t * mempool, void *elem)
   SC_ASSERT (mempool->elem_count > 0);
 
 #ifdef SC_DEBUG
-  memset (elem, -1, mempool->elem_size);
+  if (!mempool->zero_and_persist) {
+    memset (elem, -1, mempool->elem_size);
+  }
 #endif
 
   --mempool->elem_count;
@@ -597,7 +633,7 @@ typedef struct sc_list
 }
 sc_list_t;
 
-/** Calculate the memory used by a list.
+/** Calculate the total memory used by a list.
  * \param [in] list        The list.
  * \param [in] is_dynamic  True if created with sc_list_new,
  *                         false if initialized with sc_list_init
@@ -605,52 +641,73 @@ sc_list_t;
  */
 size_t              sc_list_memory_used (sc_list_t * list, int is_dynamic);
 
-/** Allocate a linked list structure.
- * \param [in] allocator Memory allocator for sc_link_t, can be NULL.
+/** Allocate a new, empty linked list.
+ * \param [in] allocator    Memory allocator for sc_link_t, can be NULL
+ *                          in which case an internal allocator is created.
+ * \return                  Pointer to a newly allocated, empty list object.
  */
 sc_list_t          *sc_list_new (sc_mempool_t * allocator);
 
 /** Destroy a linked list structure in O(N).
+ * \param [in,out] list     All memory allocated for this list is freed.
  * \note If allocator was provided in sc_list_new, it will not be destroyed.
  */
 void                sc_list_destroy (sc_list_t * list);
 
-/** Initializes an already allocated list structure.
+/** Initialize a list object with an external link allocator.
  * \param [in,out]  list       List structure to be initialized.
- * \param [in]      allocator  External memory allocator for sc_link_t.
+ * \param [in]      allocator  External memory allocator for sc_link_t,
+ *                             which must exist already.
  */
 void                sc_list_init (sc_list_t * list, sc_mempool_t * allocator);
 
-/** Removes all elements from a list in O(N).
- * \param [in,out]  list       List structure to be resetted.
+/** Remove all elements from a list in O(N).
+ * \param [in,out]  list       List structure to be emptied.
  * \note Calling sc_list_init, then any list operations,
  *       then sc_list_reset is memory neutral.
  */
 void                sc_list_reset (sc_list_t * list);
 
-/** Unliks all list elements without returning them to the mempool.
- * This runs in O(1) but is dangerous because of potential memory leaks.
+/** Unlink all list elements without returning them to the mempool.
+ * This runs in O(1) but is dangerous because the link memory stays alive.
  * \param [in,out]  list       List structure to be unlinked.
  */
 void                sc_list_unlink (sc_list_t * list);
 
-void                sc_list_prepend (sc_list_t * list, void *data);
-void                sc_list_append (sc_list_t * list, void *data);
+/** Insert a list element at the beginning of the list.
+ * \param [in,out] list     Valid list object.
+ * \param [in] data         A new link is created holding this data.
+ * \return                  The link that has been created for data.
+ */
+sc_link_t          *sc_list_prepend (sc_list_t * list, void *data);
+
+/** Insert a list element at the end of the list.
+ * \param [in,out] list     Valid list object.
+ * \param [in] data         A new link is created holding this data.
+ * \return                  The link that has been created for data.
+ */
+sc_link_t          *sc_list_append (sc_list_t * list, void *data);
 
-/** Insert an element after a given position.
- * \param [in] pred The predecessor of the element to be inserted.
+/** Insert an element after a given list position.
+ * \param [in,out] list     Valid list object.
+ * \param [in,out] pred     The predecessor of the element to be inserted.
+ * \param [in] data         A new link is created holding this data.
+ * \return                  The link that has been created for data.
  */
-void                sc_list_insert (sc_list_t * list,
+sc_link_t          *sc_list_insert (sc_list_t * list,
                                     sc_link_t * pred, void *data);
 
-/** Remove an element after a given position.
+/** Remove an element after a given list position.
+ * \param [in,out] list     Valid, non-empty list object.
  * \param [in] pred  The predecessor of the element to be removed.
-                     If \a pred == NULL, the first element is removed.
- * \return Returns the data of the removed element.
+ *                   If \a pred == NULL, the first element is removed,
+ *                   which is equivalent to calling sc_list_pop (list).
+ * \return           The data of the removed and freed link.
  */
 void               *sc_list_remove (sc_list_t * list, sc_link_t * pred);
 
 /** Remove an element from the front of the list.
+ * \param [in,out] list     Valid, non-empty list object.
  * \return Returns the data of the removed first list element.
  */
 void               *sc_list_pop (sc_list_t * list);
diff --git a/sc/src/sc_dmatrix.c b/sc/src/sc_dmatrix.c
index 53c461f..21a7aa5 100644
--- a/sc/src/sc_dmatrix.c
+++ b/sc/src/sc_dmatrix.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -171,6 +172,59 @@ sc_dmatrix_new_view_offset (sc_bint_t o, sc_bint_t m, sc_bint_t n,
 }
 
 sc_dmatrix_t       *
+sc_dmatrix_new_view_column (sc_dmatrix_t * orig, sc_bint_t j)
+{
+  sc_dmatrix_t       *rdm;
+
+  SC_ASSERT (orig->m >= 0);
+  SC_ASSERT (0 <= j && j < orig->n);
+
+  rdm = SC_ALLOC (sc_dmatrix_t, 1);
+  sc_dmatrix_new_e (rdm, orig->m, orig->n, orig->e[0] + j);
+  rdm->n = 1;
+  rdm->view = 1;
+
+  return rdm;
+}
+
+void
+sc_dmatrix_view_set_column (sc_dmatrix_t * view,
+                            sc_dmatrix_t * orig, sc_bint_t j)
+{
+  const sc_bint_t     m = view->m;
+  sc_bint_t           i;
+
+  SC_ASSERT (view->view);
+  SC_ASSERT (view->m == orig->m);
+  SC_ASSERT (orig->m >= 0);
+  SC_ASSERT (0 <= j && j < orig->n);
+
+  view->e[0] = orig->e[0] + j;
+
+  if (m > 0) {
+    for (i = 1; i < m; ++i)
+      view->e[i] = view->e[i - 1] + orig->n;
+
+    view->e[m] = NULL;          /* safeguard */
+  }
+
+  view->n = 1;
+}
+
+void
+sc_dmatrix_view_set_row (sc_dmatrix_t * view,
+                         sc_dmatrix_t * orig, sc_bint_t i)
+{
+  SC_ASSERT (view->view);
+  SC_ASSERT (view->m == 1);
+  SC_ASSERT (orig->n >= 0);
+  SC_ASSERT (0 <= i && i < orig->m);
+
+  view->e[0] = orig->e[i];
+  view->n = orig->n;
+}
+
+sc_dmatrix_t       *
 sc_dmatrix_clone (const sc_dmatrix_t * X)
 {
   const sc_bint_t     totalsize = X->m * X->n;
@@ -212,7 +266,14 @@ sc_dmatrix_resize (sc_dmatrix_t * dmatrix, sc_bint_t m, sc_bint_t n)
   newsize = m * n;
 
   if (!dmatrix->view && size != newsize) {
+#ifdef SC_ENABLE_USE_REALLOC
     data = SC_REALLOC (dmatrix->e[0], double, newsize);
+#else
+    data = SC_ALLOC (double, newsize);
+    memcpy (data, dmatrix->e[0],
+            (size_t) SC_MIN (newsize, size) * sizeof (double));
+    SC_FREE (dmatrix->e[0]);
+#endif
   }
   else {
     /* for views you must know that data is large enough */
@@ -244,7 +305,14 @@ sc_dmatrix_resize_in_place (sc_dmatrix_t * dmatrix, sc_bint_t m, sc_bint_t n)
     }
   }
   if (newsize != size) {
+#ifdef SC_ENABLE_USE_REALLOC
     data = SC_REALLOC (dmatrix->e[0], double, newsize);
+#else
+    data = SC_ALLOC (double, newsize);
+    memcpy (data, dmatrix->e[0],
+            (size_t) SC_MIN (newsize, size) * sizeof (double));
+    SC_FREE (dmatrix->e[0]);
+#endif
   }
   if (n > old_n) {
     for (i = min_m - 1; i > 0; i--) {
@@ -334,6 +402,17 @@ sc_dmatrix_shift (double alpha, sc_dmatrix_t * X)
 }
 
 void
+sc_dmatrix_scale_shift (double alpha, double beta, sc_dmatrix_t * X)
+{
+  sc_bint_t           i;
+  const sc_bint_t     totalsize = X->m * X->n;
+  double             *Xdata = X->e[0];
+
+  for (i = 0; i < totalsize; ++i)
+    Xdata[i] = alpha * Xdata[i] + beta;
+}
+
+void
 sc_dmatrix_alphadivide (double alpha, sc_dmatrix_t * X)
 {
   sc_bint_t           i;
@@ -483,6 +562,24 @@ sc_dmatrix_dotdivide (const sc_dmatrix_t * X, sc_dmatrix_t * Y)
 }
 
 void
+sc_dmatrix_dotmultiply_add (const sc_dmatrix_t * A, const sc_dmatrix_t * X,
+                            sc_dmatrix_t * Y)
+{
+  sc_bint_t           i;
+  const sc_bint_t     totalsize = X->m * X->n;
+  const double       *Adata = A->e[0];
+  const double       *Xdata = X->e[0];
+  double             *Ydata = Y->e[0];
+
+  SC_ASSERT (X->m == A->m && X->n == A->n);
+  SC_ASSERT (X->m == Y->m && X->n == Y->n);
+
+  for (i = 0; i < totalsize; ++i) {
+    Ydata[i] += Adata[i] * Xdata[i];
+  }
+}
+
+void
 sc_dmatrix_copy (const sc_dmatrix_t * X, sc_dmatrix_t * Y)
 {
   const sc_bint_t     totalsize = X->m * X->n;
@@ -649,14 +746,14 @@ sc_dmatrix_rdivide (sc_trans_t transb, const sc_dmatrix_t * A,
     /* Perform an LU factorization of B. */
     SC_LAPACK_DGETRF (&N, &N, lu->e[0], &N, ipiv, &info);
 
-    SC_ASSERT (info == 0);
+    SC_CHECK_ABORT (info == 0, "Lapack routine DGETRF failed");
 
     /* Solve the linear system. */
     sc_dmatrix_copy (A, C);
     SC_LAPACK_DGETRS (&sc_transchar[transb], &N, &Nrhs, lu->e[0], &N,
                       ipiv, C->e[0], &N, &info);
 
-    SC_ASSERT (info == 0);
+    SC_CHECK_ABORT (info == 0, "Lapack routine DGETRS failed");
 
     SC_FREE (ipiv);
     sc_dmatrix_destroy (lu);
@@ -667,6 +764,22 @@ sc_dmatrix_rdivide (sc_trans_t transb, const sc_dmatrix_t * A,
 }
 
 void
+sc_dmatrix_solve_transpose_inplace (sc_dmatrix_t * A, sc_dmatrix_t * B)
+{
+  const sc_bint_t     N = A->m;
+  const sc_bint_t     nrhs = B->m;
+  sc_bint_t          *ipiv, info;
+
+  SC_ASSERT (A->n == N && B->n == N);
+
+  ipiv = SC_ALLOC (sc_bint_t, N);
+  SC_LAPACK_DGESV (&N, &nrhs, A->e[0], &N, ipiv, B->e[0], &N, &info);
+  SC_FREE (ipiv);
+
+  SC_CHECK_ABORT (info == 0, "Lapack routine DGESV failed");
+}
+
+void
 sc_dmatrix_write (const sc_dmatrix_t * dmatrix, FILE * fp)
 {
   sc_bint_t           i, j, m, n;
@@ -683,7 +796,7 @@ sc_dmatrix_write (const sc_dmatrix_t * dmatrix, FILE * fp)
 }
 
 sc_dmatrix_pool_t  *
-sc_dmatrix_pool_new (int m, int n)
+sc_dmatrix_pool_new (sc_bint_t m, sc_bint_t n)
 {
   sc_dmatrix_pool_t  *dmpool;
 
@@ -747,3 +860,54 @@ sc_dmatrix_pool_free (sc_dmatrix_pool_t * dmpool, sc_dmatrix_t * dm)
 
   *(sc_dmatrix_t **) sc_array_push (&dmpool->freed) = dm;
 }
+
+sc_darray_work_t   *
+sc_darray_work_new (const int n_threads, const int n_blocks,
+                    const int n_entries, const int alignment_bytes)
+{
+  const int           align_dbl = alignment_bytes / 8;
+  const int           n_entries_aligned = SC_ALIGN_UP (n_entries, align_dbl);
+  sc_darray_work_t   *work;
+
+  SC_ASSERT (0 < n_threads);
+  SC_ASSERT (0 < n_blocks);
+  SC_ASSERT (alignment_bytes <= 0 || (alignment_bytes % 8) == 0);
+
+  work = SC_ALLOC (sc_darray_work_t, 1);
+
+  work->data = SC_ALLOC (double, n_threads * n_blocks * n_entries_aligned);
+  work->n_threads = n_threads;
+  work->n_blocks = n_blocks;
+  work->n_entries = n_entries_aligned;
+
+  return work;
+}
+
+void
+sc_darray_work_destroy (sc_darray_work_t * work)
+{
+  SC_FREE (work->data);
+  SC_FREE (work);
+}
+
+double             *
+sc_darray_work_get (sc_darray_work_t * work, const int thread,
+                    const int block)
+{
+  SC_ASSERT (0 <= thread && thread < work->n_threads);
+  SC_ASSERT (0 <= block && block < work->n_blocks);
+
+  return work->data + work->n_entries * (work->n_blocks * thread + block);
+}
+
+int
+sc_darray_work_get_blockcount (sc_darray_work_t * work)
+{
+  return work->n_blocks;
+}
+
+int
+sc_darray_work_get_blocksize (sc_darray_work_t * work)
+{
+  return work->n_entries;
+}
diff --git a/sc/src/sc_dmatrix.h b/sc/src/sc_dmatrix.h
index 53931d4..7d24c65 100644
--- a/sc/src/sc_dmatrix.h
+++ b/sc/src/sc_dmatrix.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -23,16 +24,23 @@
 #ifndef SC_DMATRIX_H
 #define SC_DMATRIX_H
 
+/** \file sc_dmatrix.h
+ * Routines to create and manipulate small dense matrices of double.
+ * We use BLAS and LAPACK for more advanced linear algebra computations.
+ */
+
 #include <sc_blas.h>
 #include <sc_containers.h>
 
 SC_EXTERN_C_BEGIN;
 
+/** This is the matrix object.  It can have its own storage or be a view. */
 typedef struct sc_dmatrix
 {
-  double            **e;
-  sc_bint_t           m, n;
-  int                 view;
+  double            **e;        /**< Array into the rows of the matrix. */
+  sc_bint_t           m;        /**< Number of rows in this matrix. */
+  sc_bint_t           n;        /**< Number of columns in this matrix. */
+  int                 view;     /**< Boolean to indicate this is a view. */
 }
 sc_dmatrix_t;
 
@@ -54,17 +62,32 @@ int                 sc_darray_is_range (const double *darray, size_t nelem,
                                         double low, double high);
 
 /** Calculate the memory used by a dmatrix.
- * \param [in] array       The dmatrix.
+ * \param [in] dmatrix     The dmatrix.
  * \return                 Memory used in bytes.
  */
 size_t              sc_dmatrix_memory_used (sc_dmatrix_t * dmatrix);
 
-/*
- * The sc_dmatrix_new/clone functions abort on allocation errors.
- * There is no need to check the return value.
+/** Create a new uninitialized matrix object.
+ * This function aborts on memory allocation errors.
+ * \param [in] m            Number of rows.
+ * \param [in] n            Number of columns.
+ * \return                  A valid dmatrix object with uninitialized entries.
  */
 sc_dmatrix_t       *sc_dmatrix_new (sc_bint_t m, sc_bint_t n);
+
+/** Create a new matrix object with all entries set to zero.
+ * This function aborts on memory allocation errors.
+ * \param [in] m            Number of rows.
+ * \param [in] n            Number of columns.
+ * \return                  A valid dmatrix object storing all zeros.
+ */
 sc_dmatrix_t       *sc_dmatrix_new_zero (sc_bint_t m, sc_bint_t n);
+
+/** Create a new matrix object with the same size and entries as another.
+ * This function aborts on memory allocation errors.
+ * \param [in] dmatrix      A valid dmatrix or view.
+ * \return                  A valid dmatrix with size and entries of \b dmatrix.
+ */
 sc_dmatrix_t       *sc_dmatrix_clone (const sc_dmatrix_t * dmatrix);
 
 /** Create a matrix view on an existing data array.
@@ -77,6 +100,7 @@ sc_dmatrix_t       *sc_dmatrix_new_data (sc_bint_t m, sc_bint_t n,
 /** Create a matrix view on an existing sc_dmatrix_t.
  * The original matrix must have greater equal as many elements as the view.
  * The original matrix must not be destroyed or resized while view is in use.
+ * \note            Currently, creating views of views is not safe.
  */
 sc_dmatrix_t       *sc_dmatrix_new_view (sc_bint_t m, sc_bint_t n,
                                          sc_dmatrix_t * orig);
@@ -85,17 +109,57 @@ sc_dmatrix_t       *sc_dmatrix_new_view (sc_bint_t m, sc_bint_t n,
  * The start of the view is offset by a number of rows.
  * The original matrix must have greater equal as many elements as view end.
  * The original matrix must not be destroyed or resized while view is in use.
- * \param[in] o     Number of rows that the view is offset.
+ * \param [in] o    Number of rows that the view is offset.
  *                  Requires (o + m) * n <= orig->m * orig->n.
+ * \param [in] m    Number of rows that the view shall have.
+ * \param [in] n    Number of columns that the view shall have.
+ * \param [in] orig     This valid matrix is viewed.
+ * \return              A newly created mxn view onto \b orig.
+ * \note            Currently, creating views of views is not safe.
  */
 sc_dmatrix_t       *sc_dmatrix_new_view_offset (sc_bint_t o,
                                                 sc_bint_t m, sc_bint_t n,
                                                 sc_dmatrix_t * orig);
 
+/** Create a matrix view onto one column of an existing sc_dmatrix_t.
+ * \param [in] orig     This matrix must have at least one column.
+ *                      Its \b jth column is returned as a view.
+ * \param [in] j        Valid column index into \b orig.
+ * \return              A matrix of as many rows as \b orig and one column
+ *                      whose entries point at the jth column of \b orig.
+ * \note            Currently, creating views of views is not safe.
+ */
+sc_dmatrix_t       *sc_dmatrix_new_view_column (sc_dmatrix_t * orig,
+                                                sc_bint_t j);
+
+/** Change a matrix view to point at a single column of another matrix.
+ * \param [in,out] view     This must be a view and is modified in place.
+ *                          It must have the same number of rows as \b orig.
+ *                          On return, its number of columns will be one.
+ * \param [in] orig         The \b jth column of this matrix is viewed.
+ * \param [in] j            Valid column index into \b orig.
+ * \note            Currently, creating views of views is not safe.
+ */
+void                sc_dmatrix_view_set_column (sc_dmatrix_t * view,
+                                                sc_dmatrix_t * orig,
+                                                sc_bint_t j);
+
+/** Change a matrix view to point at a single row of another matrix.
+ * \param [in,out] view     This must be a view and is modified in place.
+ *                          It must have precisely one row.  On return,
+ *                          its number of columns will match \b orig.
+ * \param [in] orig         The \b ith row of this matrix is viewed.
+ * \param [in] i            Valid row index into \b orig.
+ * \note            Currently, creating views of views is not safe.
+ */
+void                sc_dmatrix_view_set_row (sc_dmatrix_t * view,
+                                             sc_dmatrix_t * orig,
+                                             sc_bint_t i);
+
 /** Reshape a matrix to different m and n without changing m * n.
  */
-void                sc_dmatrix_reshape (sc_dmatrix_t * dmatrix, sc_bint_t m,
-                                        sc_bint_t n);
+void                sc_dmatrix_reshape (sc_dmatrix_t * dmatrix,
+                                        sc_bint_t m, sc_bint_t n);
 
 /** Change the matrix dimensions.
  * For views it must be known that the new size is permitted.
@@ -115,7 +179,7 @@ void                sc_dmatrix_resize (sc_dmatrix_t * dmatrix,
 void                sc_dmatrix_resize_in_place (sc_dmatrix_t * dmatrix,
                                                 sc_bint_t m, sc_bint_t n);
 
-/** Destroy a dmatrix and all allocated memory */
+/** Destroy a dmatrix and all allocated memory. */
 void                sc_dmatrix_destroy (sc_dmatrix_t * dmatrix);
 
 /** Check whether a dmatrix is free of NaN entries.
@@ -124,13 +188,22 @@ void                sc_dmatrix_destroy (sc_dmatrix_t * dmatrix);
 int                 sc_dmatrix_is_valid (const sc_dmatrix_t * A);
 
 /** Check a square dmatrix for symmetry.
- * \param [in] tolerance    measures the absolute value of the max difference.
- * \return                  true if matrix is numerically symmetric.
+ * \param [in] A            This square dmatrix is checked for symmetry.
+ * \param [in] tolerance    Measures the absolute value of the max difference.
+ * \return                  true if and only if matrix is numerically symmetric.
  */
 int                 sc_dmatrix_is_symmetric (const sc_dmatrix_t * A,
                                              double tolerance);
 
+/** Set a matrix to all zero entries.
+ * \param [in,out] dmatrix  Valid dmatrix whose entries are zero'd.
+ */
 void                sc_dmatrix_set_zero (sc_dmatrix_t * dmatrix);
+
+/** Set all entries of a matrix to a constant.
+ * \param [in,out] dmatrix  Valid dmatrix whose entries are set to \b value.
+ * \param [in] value        This value is written into every entry of \b dmatrix.
+ */
 void                sc_dmatrix_set_value (sc_dmatrix_t * dmatrix,
                                           double value);
 
@@ -142,6 +215,11 @@ void                sc_dmatrix_scale (double alpha, sc_dmatrix_t * X);
  */
 void                sc_dmatrix_shift (double alpha, sc_dmatrix_t * X);
 
+/** Perform element-wise multipl. & addition w/ scalar, X := alpha .* X + beta.
+ */
+void                sc_dmatrix_scale_shift (double alpha, double beta,
+                                            sc_dmatrix_t * X);
+
 /** Perform element-wise divison with a scalar, X := alpha ./ X.
  */
 void                sc_dmatrix_alphadivide (double alpha, sc_dmatrix_t * X);
@@ -195,9 +273,25 @@ void                sc_dmatrix_dotmultiply (const sc_dmatrix_t * X,
 void                sc_dmatrix_dotdivide (const sc_dmatrix_t * X,
                                           sc_dmatrix_t * Y);
 
+/** Perform element-wise multiplication & addition, Y := A .* X + Y.
+ */
+void                sc_dmatrix_dotmultiply_add (const sc_dmatrix_t * A,
+                                                const sc_dmatrix_t * X,
+                                                sc_dmatrix_t * Y);
+
+/** Copy one matrix into another.
+ * \param [in] X        Matrix taken as a source.
+ * \param [in,out] Y    Matrix of dimensions of \b X.
+ *                      On output, its entries are set to X.
+ */
 void                sc_dmatrix_copy (const sc_dmatrix_t * X,
                                      sc_dmatrix_t * Y);
 
+/** Copy one matrix transposed into another.
+ * \param [in] X        Matrix taken as a source.
+ * \param [in,out] Y    Matrix of dimensions of \b X transposed.
+ *                      On output, its entries are set to X transposed.
+ */
 void                sc_dmatrix_transpose (const sc_dmatrix_t * X,
                                           sc_dmatrix_t * Y);
 
@@ -206,13 +300,16 @@ void                sc_dmatrix_transpose (const sc_dmatrix_t * X,
 void                sc_dmatrix_add (double alpha, const sc_dmatrix_t * X,
                                     sc_dmatrix_t * Y);
 
-/**
- * Perform matrix-vector multiplication Y = alpha * A * X + beta * Y.
- * \param [in] transa    Transpose operation for matrix A.
- * \param [in] transx    Transpose operation for matrix X.
- * \param [in] transy    Transpose operation for matrix Y.
- * \param [in] A         Matrix.
- * \param [in] X, Y      Column or row vectors (or one each).
+/** Perform matrix-vector multiplication Y = alpha * A * X + beta * Y.
+ * The dimensions of A, X, and Y must be compatible.
+ * \param [in] transa   Transpose operation for matrix A.
+ * \param [in] transx   Transpose operation for matrix X.
+ * \param [in] transy   Transpose operation for matrix Y.
+ * \param [in] alpha    Factor for the matrix to multiply.
+ * \param [in] A        Valid matrix or view.
+ * \param [in] X        Column or row vector.
+ * \param [in] beta     Factor for the original matrix.
+ * \param [in] Y        Column or row vector.
  */
 void                sc_dmatrix_vector (sc_trans_t transa,
                                        sc_trans_t transx,
@@ -221,11 +318,15 @@ void                sc_dmatrix_vector (sc_trans_t transa,
                                        const sc_dmatrix_t * X, double beta,
                                        sc_dmatrix_t * Y);
 
-/*! \brief Matrix Matrix Multiply  \c C := alpha * A * B + beta * C
- *
- *   \param A matrix
- *   \param B matrix
- *   \param C matrix
+/** Matrix-matrix multiplication \c C := alpha * A * B + beta * C
+ * The dimensions of A, B, and C must be compatible.
+ * \param [in] transa   Transpose operation for matrix A.
+ * \param [in] transb   Transpose operation for matrix B.
+ * \param [in] alpha    Factor for the matrix to multiply.
+ * \param [in] A        First matrix to multiply.
+ * \param [in] B        Second matrix to multiply.
+ * \param [in] beta     Factor for the original matrix.
+ * \param [in,out] C    Matrix is modified in place.
  */
 void                sc_dmatrix_multiply (sc_trans_t transa,
                                          sc_trans_t transb, double alpha,
@@ -261,6 +362,19 @@ void                sc_dmatrix_rdivide (sc_trans_t transb,
                                         const sc_dmatrix_t * B,
                                         sc_dmatrix_t * C);
 
+/** \brief Solve B^T <- A^{-T} B^T.
+ * This call is destructive on the entries of the matrix A.
+ * Solving multiple right hand sides is supported.
+ *
+ *   \param[in,out] A   Square invertible matrix.  Values are changed.
+ *                      Its transpose is inverted and applied to B^T.
+ *   \param[in,out] B   Rectangular matrix with as many columns as A.
+ *                      On input, each row is an independent right hand side.
+ *                      On output, each row holds the corresponding solution.
+ */
+void                sc_dmatrix_solve_transpose_inplace
+  (sc_dmatrix_t * A, sc_dmatrix_t * B);
+
 /** \brief Writes a matrix to an opened stream.
  *
  *   \param dmatrix Pointer to matrix to write
@@ -269,14 +383,13 @@ void                sc_dmatrix_rdivide (sc_trans_t transb,
 void                sc_dmatrix_write (const sc_dmatrix_t * dmatrix,
                                       FILE * fp);
 
-/*
- * The sc_dmatrix_pool recycles matrices of the same size.
- */
+/** The sc_dmatrix_pool recycles matrices of the same size. */
 typedef struct sc_dmatrix_pool
 {
-  int                 m, n;
-  size_t              elem_count;
-  sc_array_t          freed;    /* buffers the freed elements */
+  sc_bint_t           m;        /**< Number of rows of the matrices stored. */
+  sc_bint_t           n;        /**< Number of columns of matrices stored. */
+  size_t              elem_count;       /**< Number of matrices alive. */
+  sc_array_t          freed;    /**< Buffer for the matrices returned. */
 }
 sc_dmatrix_pool_t;
 
@@ -285,30 +398,81 @@ sc_dmatrix_pool_t;
  * \param [in] n    Column count of the stored matrices.
  * \return          Returns a dmatrix pool that is ready to use.
  */
-sc_dmatrix_pool_t  *sc_dmatrix_pool_new (int m, int n);
+sc_dmatrix_pool_t  *sc_dmatrix_pool_new (sc_bint_t m, sc_bint_t n);
 
 /** Destroy a dmatrix pool.
  * This will also destroy all matrices stored for reuse.
  * Requires all allocated matrices to be returned to the pool previously.
- * \param [in]      The dmatrix pool to destroy.
+ * \param [in,out] dmpool       The dmatrix pool to destroy.
  */
 void                sc_dmatrix_pool_destroy (sc_dmatrix_pool_t * dmpool);
 
 /** Allocate a dmatrix from the pool.
  * Reuses a matrix previously returned to the pool, or allocated a fresh one.
- * \param [in] pool   The dmatrix pool to use.
- * \return            Returns a dmatrix of size pool->m by pool->n.
+ * \param [in,out] dmpool   The dmatrix pool to use.
+ * \return                  Returns a matrix of size dmpool->m by dmpool->n.
  */
 sc_dmatrix_t       *sc_dmatrix_pool_alloc (sc_dmatrix_pool_t * dmpool);
 
 /** Return a dmatrix to the pool.
  * The matrix is stored internally for reuse and not freed in this function.
- * \param [in] pool   The dmatrix pool to use.
+ * \param [in] dmpool The dmatrix pool to use.
  * \param [in] dm     The dmatrix pool to return to the pool.
  */
 void                sc_dmatrix_pool_free (sc_dmatrix_pool_t * dmpool,
                                           sc_dmatrix_t * dm);
 
+/** Multithreaded workspace allocations of multiple blocks. */
+typedef struct sc_darray_work
+{
+  double             *data;       /**< Entries of all blocks of all threads */
+  int                 n_threads;  /**< Number of threads */
+  int                 n_blocks;   /**< Number of blocks per thread */
+  int                 n_entries;  /**< Number of entries per block */
+}
+sc_darray_work_t;
+
+/** Create a new multithreaded workspace allocation object.
+ * For each thread \c n_blocks of memory blocks with at least \c n_entries
+ * double values are allocated.  The actual number of entries per block is
+ * adjusted such that the base-pointer for each block is aligned to
+ * \c alignment_bytes.  It is assumed that SC_ALLOC returns an aligned
+ * base-pointer.  This function aborts on memory allocation errors.
+ * \param [in] n_threads        Number of threads.
+ * \param [in] n_blocks         Number of blocks per thread.
+ * \param [in] n_entries        Minimum number of entries per block.
+ * \param [in] alignment_bytes  Align blocks to this byte boundary.
+ * \return                      A valid darray_work object.
+ */
+sc_darray_work_t   *sc_darray_work_new (const int n_threads,
+                                        const int n_blocks,
+                                        const int n_entries,
+                                        const int alignment_bytes);
+
+/** Destroy a darray_work object and all allocated memory. */
+void                sc_darray_work_destroy (sc_darray_work_t * work);
+
+/** Get workspace allocation of a certain thread and a specified block.
+ * \param [in] work         Workspace taken as a source.
+ * \param [in] thread       Valid thread index into \b work.
+ * \param [in] block        Valid block index into \b work.
+ * \return                  Pointer to entries (memory aligned).
+ */
+double             *sc_darray_work_get (sc_darray_work_t * work,
+                                        const int thread, const int block);
+
+/** Get the number of blocks per thread of a workspace allocation.
+ * \param [in] work         Workspace taken as a source.
+ * \return                  Number of allocated blocks per thread.
+ */
+int                 sc_darray_work_get_blockcount (sc_darray_work_t * work);
+
+/** Get the number of entries per block of a workspace allocation.
+ * \param [in] work         Workspace taken as a source.
+ * \return                  Number of allocated entries per block.
+ */
+int                 sc_darray_work_get_blocksize (sc_darray_work_t * work);
+
 SC_EXTERN_C_END;
 
 #endif /* !SC_DMATRIX_H */
diff --git a/sc/src/sc_flops.c b/sc/src/sc_flops.c
index e88ed07..194fa2f 100644
--- a/sc/src/sc_flops.c
+++ b/sc/src/sc_flops.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_flops.h b/sc/src/sc_flops.h
index af212ed..bd3a13a 100644
--- a/sc/src/sc_flops.h
+++ b/sc/src/sc_flops.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_functions.c b/sc/src/sc_functions.c
index 6c31085..ab06848 100644
--- a/sc/src/sc_functions.c
+++ b/sc/src/sc_functions.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -23,6 +24,60 @@
 #include <sc_functions.h>
 #include <time.h>
 
+int
+sc_intpow (int base, int exp)
+{
+  int                 result = 1;
+
+  SC_ASSERT (exp >= 0);
+
+  while (exp) {
+    if (exp & 1) {
+      result *= base;
+    }
+    exp >>= 1;
+    base *= base;
+  }
+
+  return result;
+}
+
+int64_t
+sc_intpow64 (int64_t base, int exp)
+{
+  int64_t             result = 1;
+
+  SC_ASSERT (exp >= 0);
+
+  while (exp) {
+    if (exp & 1) {
+      result *= base;
+    }
+    exp >>= 1;
+    base *= base;
+  }
+
+  return result;
+}
+
+uint64_t
+sc_intpow64u (uint64_t base, int exp)
+{
+  uint64_t            result = 1;
+
+  SC_ASSERT (exp >= 0);
+
+  while (exp) {
+    if (exp & 1) {
+      result *= base;
+    }
+    exp >>= 1;
+    base *= base;
+  }
+
+  return result;
+}
+
 double
 sc_function1_invert (sc_function1_t func, void *data,
                      double x_low, double x_high, double y, double rtol)
diff --git a/sc/src/sc_functions.h b/sc/src/sc_functions.h
index d147f96..4060940 100644
--- a/sc/src/sc_functions.h
+++ b/sc/src/sc_functions.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -27,6 +28,35 @@
 
 SC_EXTERN_C_BEGIN;
 
+/** Integer power routine.
+ * Found in github.com:bfam/bfam.git, and originally in
+ * http://stackoverflow.com/questions/101439/\
+ * the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int
+ *
+ * \param [in] base         This integer is taken to the power of \b exp.
+ *                          It may be negative as well.
+ * \param [in] exp          This non-negative integer is the exponent.
+ * \return                  We compute \b base ** \b exp.
+ */
+int                 sc_intpow (int base, int exp);
+
+/** Power routine for 64-bit integers.
+ * \see sc_intpow.
+ * \param [in] base         This integer is taken to the power of \b exp.
+ *                          It may be negative as well.
+ * \param [in] exp          This non-negative integer is the exponent.
+ * \return                  We compute \b base ** \b exp.
+ */
+int64_t             sc_intpow64 (int64_t base, int exp);
+
+/** Power routine for unsigned 64-bit integers.
+ * \see sc_intpow.
+ * \param [in] base         This integer is taken to the power of \b exp.
+ * \param [in] exp          This non-negative integer is the exponent.
+ * \return                  We compute \b base ** \b exp.
+ */
+uint64_t            sc_intpow64u (uint64_t base, int exp);
+
 typedef double      (*sc_function1_t) (double x, void *data);
 
 typedef double      (*sc_function3_t) (double x, double y, double z,
diff --git a/sc/src/sc_getopt.h b/sc/src/sc_getopt.h
index 5e1ac38..d99ac9d 100644
--- a/sc/src/sc_getopt.h
+++ b/sc/src/sc_getopt.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_io.c b/sc/src/sc_io.c
index 831e16c..0f416b0 100644
--- a/sc/src/sc_io.c
+++ b/sc/src/sc_io.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -164,6 +165,21 @@ sc_io_sink_complete (sc_io_sink_t * sink,
   return SC_IO_ERROR_NONE;
 }
 
+int
+sc_io_sink_align (sc_io_sink_t * sink, size_t bytes_align)
+{
+  size_t              fill_bytes;
+  char               *fill;
+  int                 retval;
+
+  fill_bytes = (bytes_align - sink->bytes_out % bytes_align) % bytes_align;
+  fill = SC_ALLOC_ZERO (char, fill_bytes);
+  retval = sc_io_sink_write (sink, fill, fill_bytes);
+  SC_FREE (fill);
+
+  return retval;
+}
+
 sc_io_source_t     *
 sc_io_source_new (sc_io_type_t iotype, sc_io_encode_t encode, ...)
 {
@@ -210,8 +226,16 @@ sc_io_source_destroy (sc_io_source_t * source)
 {
   int                 retval;
 
-  /* The error value SC_IO_ERROR_AGAIN is turned into FATAL */
+  /* complete reading */
   retval = sc_io_source_complete (source, NULL, NULL);
+
+  /* destroy mirror */
+  if (source->mirror != NULL) {
+    retval = sc_io_sink_destroy (source->mirror) || retval;
+    sc_array_destroy (source->mirror_buffer);
+  }
+
+  /* The error value SC_IO_ERROR_AGAIN is turned into FATAL */
   if (source->iotype == SC_IO_TYPE_FILENAME) {
     SC_ASSERT (source->file != NULL);
 
@@ -253,6 +277,9 @@ sc_io_source_read (sc_io_source_t * source, void *data,
       if (bbytes_out < bytes_avail) {
         retval = !feof (source->file) || ferror (source->file);
       }
+      if (retval == SC_IO_ERROR_NONE && source->mirror != NULL) {
+        retval = sc_io_sink_write (source->mirror, data, bbytes_out);
+      }
     }
     else {
       retval = fseek (source->file, (long) bytes_avail, SEEK_CUR);
@@ -279,12 +306,20 @@ int
 sc_io_source_complete (sc_io_source_t * source,
                        size_t * bytes_in, size_t * bytes_out)
 {
+  int                 retval = SC_IO_ERROR_NONE;
+
   if (source->iotype == SC_IO_TYPE_BUFFER) {
     SC_ASSERT (source->buffer != NULL);
     if (source->buffer_bytes % source->buffer->elem_size != 0) {
       return SC_IO_ERROR_AGAIN;
     }
   }
+  else if (source->iotype == SC_IO_TYPE_FILENAME ||
+           source->iotype == SC_IO_TYPE_FILEFILE) {
+    if (source->mirror != NULL) {
+      retval = sc_io_sink_complete (source->mirror, NULL, NULL);
+    }
+  }
 
   if (bytes_in != NULL) {
     *bytes_in = source->bytes_in;
@@ -294,7 +329,57 @@ sc_io_source_complete (sc_io_source_t * source,
   }
   source->bytes_in = source->bytes_out = 0;
 
-  return SC_IO_ERROR_NONE;
+  return retval;
+}
+
+int
+sc_io_source_align (sc_io_source_t * source, size_t bytes_align)
+{
+  size_t              fill_bytes;
+
+  fill_bytes = (bytes_align - source->bytes_out % bytes_align) % bytes_align;
+
+  return sc_io_source_read (source, NULL, fill_bytes, NULL);
+}
+
+int
+sc_io_source_activate_mirror (sc_io_source_t * source)
+{
+  if (source->iotype == SC_IO_TYPE_BUFFER) {
+    return SC_IO_ERROR_FATAL;
+  }
+  if (source->mirror != NULL) {
+    return SC_IO_ERROR_FATAL;
+  }
+
+  source->mirror_buffer = sc_array_new (sizeof (char));
+  source->mirror = sc_io_sink_new (SC_IO_TYPE_BUFFER, SC_IO_MODE_WRITE,
+                                   SC_IO_ENCODE_NONE, source->mirror_buffer);
+
+  return (source->mirror != NULL ? SC_IO_ERROR_NONE : SC_IO_ERROR_FATAL);
+}
+
+int
+sc_io_source_read_mirror (sc_io_source_t * source, void *data,
+                          size_t bytes_avail, size_t * bytes_out)
+{
+  sc_io_source_t     *mirror_src;
+  int                 retval;
+
+  if (source->mirror_buffer == NULL) {
+    return SC_IO_ERROR_FATAL;
+  }
+
+  mirror_src = sc_io_source_new (SC_IO_TYPE_BUFFER, SC_IO_ENCODE_NONE,
+                                 source->mirror_buffer);
+  retval = (mirror_src != NULL ? SC_IO_ERROR_NONE : SC_IO_ERROR_FATAL);
+  retval = retval || sc_io_source_read (mirror_src, data, bytes_avail,
+                                        bytes_out);
+  if (mirror_src != NULL) {
+    retval = sc_io_source_destroy (mirror_src) || retval;
+  }
+
+  return retval;
 }
 
 int
diff --git a/sc/src/sc_io.h b/sc/src/sc_io.h
index 054b0e5..728d2ef 100644
--- a/sc/src/sc_io.h
+++ b/sc/src/sc_io.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -85,6 +86,8 @@ typedef struct sc_io_source
   FILE               *file;
   size_t              bytes_in;
   size_t              bytes_out;
+  sc_io_sink_t       *mirror;
+  sc_array_t         *mirror_buffer;
 }
 sc_io_source_t;
 
@@ -117,7 +120,7 @@ int                 sc_io_sink_destroy (sc_io_sink_t * sink);
  * The internal counters sink->bytes_in and sink->bytes_out are updated.
  * \param [in,out] sink         The sink object to write to.
  * \param [in] data             Data passed into sink.
- * \param [in] bytes_avail      Number of data bytes passed in.;
+ * \param [in] bytes_avail      Number of data bytes passed in.
  * \return                      0 on success, nonzero on error.
  */
 int                 sc_io_sink_write (sc_io_sink_t * sink,
@@ -145,6 +148,14 @@ int                 sc_io_sink_complete (sc_io_sink_t * sink,
                                          size_t * bytes_in,
                                          size_t * bytes_out);
 
+/** Align sink to a byte boundary by writing zeros.
+ * \param [in,out] sink         The sink object to align.
+ * \param [in] bytes_align      Byte boundary.
+ * \return                      0 on success, nonzero on error.
+ */
+int                 sc_io_sink_align (sc_io_sink_t * sink,
+                                      size_t bytes_align);
+
 /** Create a generic data source.
  * \param [in] iotype           Type of the source.
  *                              Depending on iotype, varargs must follow:
@@ -204,6 +215,30 @@ int                 sc_io_source_complete (sc_io_source_t * source,
                                            size_t * bytes_in,
                                            size_t * bytes_out);
 
+/** Align source to a byte boundary by skipping.
+ * \param [in,out] source       The source object to align.
+ * \param [in] bytes_align      Byte boundary.
+ * \return                      0 on success, nonzero on error.
+ */
+int                 sc_io_source_align (sc_io_source_t * source,
+                                        size_t bytes_align);
+
+/** Activate a buffer that mirrors (i.e., stores) the data that was read.
+ * \param [in,out] source       The source object to activate mirror in.
+ * \return                      0 on success, nonzero on error.
+ */
+int                 sc_io_source_activate_mirror (sc_io_source_t * source);
+
+/** Read data from the source's mirror.
+ * Same behaviour as sc_io_source_read.
+ * \param [in,out] source       The source object to read mirror data from.
+ * \return                      0 on success, nonzero on error.
+ */
+int                 sc_io_source_read_mirror (sc_io_source_t * source,
+                                              void *data,
+                                              size_t bytes_avail,
+                                              size_t * bytes_out);
+
 /** This function writes numeric binary data in VTK base64 encoding.
  * \param vtkfile        Stream openened for writing.
  * \param numeric_data   A pointer to a numeric data array.
diff --git a/sc/src/sc_keyvalue.c b/sc/src/sc_keyvalue.c
index c458954..0293390 100644
--- a/sc/src/sc_keyvalue.c
+++ b/sc/src/sc_keyvalue.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -37,6 +38,12 @@ typedef struct sc_keyvalue_entry
 }
 sc_keyvalue_entry_t;
 
+struct sc_keyvalue
+{
+  sc_hash_t          *hash;
+  sc_mempool_t       *value_allocator;
+};
+
 static unsigned
 sc_keyvalue_entry_hash (const void *v, const void *u)
 {
@@ -285,6 +292,39 @@ sc_keyvalue_get_pointer (sc_keyvalue_t * kv, const char *key, void *dvalue)
     return dvalue;
 }
 
+int
+sc_keyvalue_get_int_check (sc_keyvalue_t * kv, const char *key, int *status)
+{
+  int                 result;
+  int                 etype;
+  void              **found;
+  sc_keyvalue_entry_t svalue, *pvalue = &svalue;
+  sc_keyvalue_entry_t *value;
+
+  SC_ASSERT (kv != NULL);
+  SC_ASSERT (key != NULL);
+
+  result = (status != NULL) ? *status : INT_MIN;
+  etype = 1;
+  pvalue->key = key;
+  pvalue->type = SC_KEYVALUE_ENTRY_NONE;
+  if (sc_hash_lookup (kv->hash, pvalue, &found)) {
+    value = (sc_keyvalue_entry_t *) (*found);
+    if (value->type == SC_KEYVALUE_ENTRY_INT) {
+      etype = 0;
+      result = value->value.i;
+    }
+    else {
+      etype = 2;
+    }
+  }
+  SC_ASSERT (status != NULL || etype == 0);
+  if (status != NULL) {
+    *status = etype;
+  }
+  return result;
+}
+
 void
 sc_keyvalue_set_int (sc_keyvalue_t * kv, const char *key, int newvalue)
 {
diff --git a/sc/src/sc_keyvalue.h b/sc/src/sc_keyvalue.h
index 82184bc..b8adef3 100644
--- a/sc/src/sc_keyvalue.h
+++ b/sc/src/sc_keyvalue.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -23,68 +24,173 @@
 #ifndef SC_KEYVALUE_H
 #define SC_KEYVALUE_H
 
+/** \file sc_keyvalue.h
+ * This file provides a lookup structure for key-value pairs.
+ */
+
 #include <sc.h>
 #include <sc_containers.h>
 
 SC_EXTERN_C_BEGIN;
 
+/** The values can have different types. */
 typedef enum
 {
-  SC_KEYVALUE_ENTRY_NONE = 0,
-  SC_KEYVALUE_ENTRY_INT,
-  SC_KEYVALUE_ENTRY_DOUBLE,
-  SC_KEYVALUE_ENTRY_STRING,
-  SC_KEYVALUE_ENTRY_POINTER
+  SC_KEYVALUE_ENTRY_NONE = 0,   /**< Designate an invalid situation. */
+  SC_KEYVALUE_ENTRY_INT,        /**< Used for values of type int. */
+  SC_KEYVALUE_ENTRY_DOUBLE,     /**< Used for values of type double. */
+  SC_KEYVALUE_ENTRY_STRING,     /**< Used for values of type const char *. */
+  SC_KEYVALUE_ENTRY_POINTER     /**< Used for values of anonymous pointer type. */
 }
 sc_keyvalue_entry_type_t;
 
-typedef struct sc_keyvalue
-{
-  sc_hash_t          *hash;
-  sc_mempool_t       *value_allocator;
-}
-sc_keyvalue_t;
+/** The key-value container is an opaque structure. */
+typedef struct sc_keyvalue sc_keyvalue_t;
 
-/* Constructors / destructors */
+/** Create a new key-value container.
+ * \return          The container is ready to use.
+ */
 sc_keyvalue_t      *sc_keyvalue_new ();
 
-/* Arguments come in pairs of 2: static string "type:key" and value;
-   type is a letter like the identifier names in sc_keyvalue_entry.value */
+/** Create a container and set one or more key-value pairs.
+ * Arguments come in pairs of 2: a static string "type:key" and a value.
+ * The type is the letter i, g, s, p for int, double, const char *, and void *,
+ * respectively.
+ * \param [in] dummy            Not touched, just to use the varargs feature.
+ * \return                      A key-value container initialized with the given entries.
+ */
 sc_keyvalue_t      *sc_keyvalue_newf (int dummy, ...);
+
+/** Create a container and set one or more key-value pairs.
+ * This function works analogously to \ref sc_keyvalue_newf.
+ * Arguments come in pairs of 2: a static string "type:key" and a value.
+ * The type is the letter i, g, s, p for int, double, const char *, and void *,
+ * respectively.
+ * \param [in] ap               Varargs pointer; see stdarg.h for the syntax.
+ * \return                      A key-value container initialized with the given entries.
+ */
 sc_keyvalue_t      *sc_keyvalue_newv (va_list ap);
 
+/** Free a key-value container and all internal memory for key storage.
+ * \param [in,out] kv           The key-value container is invalidated by this call.
+ */
 void                sc_keyvalue_destroy (sc_keyvalue_t * kv);
 
-/* Routine to check existence of an entry
-   returns the type if found, and SC_KEYVALUE_ENTRY_NONE otherwise */
+/** Routine to check existence of an entry.
+ * \param [in] kv               Valid key-value container.
+ * \param [in] key              Lookup key to query.
+ * \return                      The entry's type if found
+ *                              and SC_KEYVALUE_ENTRY_NONE otherwise.
+ */
 sc_keyvalue_entry_type_t sc_keyvalue_exists (sc_keyvalue_t * kv,
                                              const char *key);
 
-/* Routine to remove an entry
-   returne type if found and removed, SC_KEYVALUE_ENTRY_NONE otherwise */
+/** Routine to remove an entry.
+ * \param [in] kv               Valid key-value container.
+ * \param [in] key              Lookup key to remove if it exists.
+ * \return                      The entry's type if found and removed,
+ *                              SC_KEYVALUE_ENTRY_NONE otherwise.
+ */
 sc_keyvalue_entry_type_t sc_keyvalue_unset (sc_keyvalue_t * kv,
                                             const char *key);
 
-/* Routines to extract values from keys
-   if the key is not present then dvalue is returned */
+/** Retrieve an integer value by its key.
+ * This function asserts that the key, if existing, points to the correct type.
+ * \param [in] kv               Valid key-value container.
+ * \param [in] key              Lookup key, may or may not exist.
+ * \param [in] dvalue           Default value returned if key is not found.
+ * \return                      If key is not present then \b dvalue is returned,
+ *                              otherwise the value stored under \b key.
+ */
 int                 sc_keyvalue_get_int (sc_keyvalue_t * kv,
                                          const char *key, int dvalue);
+
+/** Retrieve a double value by its key.
+ * This function asserts that the key, if existing, points to the correct type.
+ * \param [in] kv               Valid key-value container.
+ * \param [in] key              Lookup key, may or may not exist.
+ * \param [in] dvalue           Default value returned if key is not found.
+ * \return                      If key is not present then \b dvalue is returned,
+ *                              otherwise the value stored under \b key.
+ */
 double              sc_keyvalue_get_double (sc_keyvalue_t * kv,
                                             const char *key, double dvalue);
+
+/** Retrieve a string value by its key.
+ * This function asserts that the key, if existing, points to the correct type.
+ * \param [in] kv               Valid key-value container.
+ * \param [in] key              Lookup key, may or may not exist.
+ * \param [in] dvalue           Default value returned if key is not found.
+ * \return                      If key is not present then \b dvalue is returned,
+ *                              otherwise the value stored under \b key.
+ */
 const char         *sc_keyvalue_get_string (sc_keyvalue_t * kv,
                                             const char *key,
                                             const char *dvalue);
+
+/** Retrieve a pointer value by its key.
+ * This function asserts that the key, if existing, points to the correct type.
+ * \param [in] kv               Valid key-value container.
+ * \param [in] key              Lookup key, may or may not exist.
+ * \param [in] dvalue           Default value returned if key is not found.
+ * \return                      If key is not present then \b dvalue is returned,
+ *                              otherwise the value stored under \b key.
+ */
 void               *sc_keyvalue_get_pointer (sc_keyvalue_t * kv,
                                              const char *key, void *dvalue);
 
-/* Routines to set values for a given key */
+/** Query an integer key with error checking.
+ * We check whether the key is not found or it is of the wrong type.
+ * A default value to be returned on error can be passed in as *status.
+ * If status is NULL, then the result on error is undefined.
+ * \param [in] kv           Valid key-value table.
+ * \param [in] key          Non-NULL key string.
+ * \param [in,out] status   If not NULL, set to
+ *                          0 if there is no error,
+ *                          1 if the key is not found,
+ *                          2 if a value is found but its type is not integer,
+ *                          and return the input value *status on error.
+ * \return                  On error we return *status if status is not NULL,
+ *                          and else an undefined value backed by an assertion.
+ *                          Without error, return the result of the lookup.
+ */
+int                 sc_keyvalue_get_int_check (sc_keyvalue_t * kv,
+                                               const char *key, int *status);
+
+/** Routine to set an integer value for a given key.
+ * \param [in] kv           Valid key-value table.
+ * \param [in] key          Non-NULL key to insert or replace.
+ *                          If it already exists, it must be of type integer.
+ * \param [in] newvalue     New value will be stored under key.
+ */
 void                sc_keyvalue_set_int (sc_keyvalue_t * kv,
                                          const char *key, int newvalue);
+
+/** Routine to set a double value for a given key.
+ * \param [in] kv           Valid key-value table.
+ * \param [in] key          Non-NULL key to insert or replace.
+ *                          If it already exists, it must be of type double.
+ * \param [in] newvalue     New value will be stored under key.
+ */
 void                sc_keyvalue_set_double (sc_keyvalue_t * kv,
                                             const char *key, double newvalue);
+
+/** Routine to set a string value for a given key.
+ * \param [in] kv           Valid key-value table.
+ * \param [in] key          Non-NULL key to insert or replace.
+ *                          If it already exists, it must be of type string.
+ * \param [in] newvalue     New value will be stored under key.
+ */
 void                sc_keyvalue_set_string (sc_keyvalue_t * kv,
                                             const char *key,
                                             const char *newvalue);
+
+/** Routine to set a pointer value for a given key.
+ * \param [in] kv           Valid key-value table.
+ * \param [in] key          Non-NULL key to insert or replace.
+ *                          If it already exists, it must be of type pointer.
+ * \param [in] newvalue     New value will be stored under key.
+ */
 void                sc_keyvalue_set_pointer (sc_keyvalue_t * kv,
                                              const char *key, void *newvalue);
 
@@ -100,9 +206,15 @@ typedef int         (*sc_keyvalue_foreach_t) (const char *key,
                                               type, void *entry,
                                               const void *u);
 
+/** Iterate through all stored key-value pairs.
+ * \param [in] kv               Valid key-value container.
+ * \param [in] fn               Function to call on each key-value pair.
+ * \param [in,out] user_data    This pointer is passed through to \b fn.
+ */
 void                sc_keyvalue_foreach (sc_keyvalue_t * kv,
                                          sc_keyvalue_foreach_t fn,
                                          void *user_data);
+
 SC_EXTERN_C_END;
 
 #endif /* !SC_KEYVALUE_H */
diff --git a/sc/src/sc_lapack.c b/sc/src/sc_lapack.c
index 5375d19..b75b832 100644
--- a/sc/src/sc_lapack.c
+++ b/sc/src/sc_lapack.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -27,7 +28,7 @@ const char          sc_jobzchar[] = { 'N', 'V', '?' };
 #ifndef SC_WITH_LAPACK
 
 int
-sc_lapack_nonimplemented ()
+sc_lapack_nonimplemented (SC_NOARGS)
 {
   SC_ABORT ("LAPACK not compiled in this configuration");
   return 0;
diff --git a/sc/src/sc_lapack.h b/sc/src/sc_lapack.h
index cfc9850..bc9d299 100644
--- a/sc/src/sc_lapack.h
+++ b/sc/src/sc_lapack.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -40,13 +41,26 @@ extern const char   sc_jobzchar[];
 #ifdef SC_WITH_LAPACK
 
 #ifndef SC_F77_FUNC
+#if defined(__bgq__)            /* && defined(__HAVE_ESSL) */
+#define SC_F77_FUNC(small,CAPS) small
+/* TODO - FIX THIS.  FOR NOW WE DO NOT USE ESSL.
+  #   define SC_F77_FUNC(small,CAPS) small ## _
+*/
+#define SC_F77_FUNC_NOESSL(small,CAPS) small
+#else
 #define SC_F77_FUNC(small,CAPS) small ## _
 #endif
+#endif /* SC_F77_FUNC */
 
 #define SC_LAPACK_DGELS   SC_F77_FUNC(dgels,DGELS)
+#define SC_LAPACK_DGESV   SC_F77_FUNC(dgesv,DGESV)
 #define SC_LAPACK_DGETRF  SC_F77_FUNC(dgetrf,DGETRF)
 #define SC_LAPACK_DGETRS  SC_F77_FUNC(dgetrs,DGETRS)
+#if defined(__bgq__)            /* && defined(__HAVE_ESSL) */
+#define SC_LAPACK_DSTEV   SC_F77_FUNC_NOESSL(dstev,DSTEV)
+#else
 #define SC_LAPACK_DSTEV   SC_F77_FUNC(dstev,DSTEV)
+#endif
 #define SC_LAPACK_DTRSM   SC_F77_FUNC(dtrsm,DTRSM)
 #define SC_LAPACK_DLAIC1  SC_F77_FUNC(dlaic1,DLAIC1)
 #define SC_LAPACK_ILAENV  SC_F77_FUNC(ilaenv,ILAENV)
@@ -58,6 +72,14 @@ void                SC_LAPACK_DGELS (const char *trans,
                                      const sc_bint_t * ldb, double *work,
                                      const sc_bint_t * lwork,
                                      sc_bint_t * info);
+
+void                SC_LAPACK_DGESV (const sc_bint_t * n,
+                                     const sc_bint_t * nrhs,
+                                     double *a, const sc_bint_t * lda,
+                                     sc_bint_t * ipiv,
+                                     double *b, const sc_bint_t * ldb,
+                                     sc_bint_t * info);
+
 void                SC_LAPACK_DGETRF (const sc_bint_t * m,
                                       const sc_bint_t * n, double *a,
                                       const sc_bint_t * lda, sc_bint_t * ipiv,
@@ -110,6 +132,7 @@ int                 SC_LAPACK_ILAENV (const sc_bint_t * ispec,
 #else /* !SC_WITH_LAPACK */
 
 #define SC_LAPACK_DGELS    (void) sc_lapack_nonimplemented
+#define SC_LAPACK_DGESV    (void) sc_lapack_nonimplemented
 #define SC_LAPACK_DGETRF   (void) sc_lapack_nonimplemented
 #define SC_LAPACK_DGETRS   (void) sc_lapack_nonimplemented
 #define SC_LAPACK_DSTEV    (void) sc_lapack_nonimplemented
@@ -117,7 +140,7 @@ int                 SC_LAPACK_ILAENV (const sc_bint_t * ispec,
 #define SC_LAPACK_DLAIC1   (void) sc_lapack_nonimplemented
 #define SC_LAPACK_ILAENV   (int)  sc_lapack_nonimplemented
 
-int                 sc_lapack_nonimplemented ();
+int                 sc_lapack_nonimplemented (SC_NOARGS);
 
 #endif
 
diff --git a/sc/src/sc_lua.h b/sc/src/sc_lua.h
index e1813aa..ce06b2b 100644
--- a/sc/src/sc_lua.h
+++ b/sc/src/sc_lua.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_mpi.c b/sc/src/sc_mpi.c
index dc1d31d..6ddbe15 100644
--- a/sc/src/sc_mpi.c
+++ b/sc/src/sc_mpi.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -92,6 +93,24 @@ sc_MPI_Comm_dup (sc_MPI_Comm comm, sc_MPI_Comm * newcomm)
 }
 
 int
+sc_MPI_Comm_create (sc_MPI_Comm comm, sc_MPI_Group group,
+                    sc_MPI_Comm * newcomm)
+{
+  *newcomm = sc_MPI_COMM_NULL;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Comm_split (sc_MPI_Comm comm, int color, int key,
+                   sc_MPI_Comm * newcomm)
+{
+  *newcomm = sc_MPI_COMM_NULL;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
 sc_MPI_Comm_free (sc_MPI_Comm * comm)
 {
   *comm = sc_MPI_COMM_NULL;
@@ -116,6 +135,144 @@ sc_MPI_Comm_rank (sc_MPI_Comm comm, int *rank)
 }
 
 int
+sc_MPI_Comm_compare (sc_MPI_Comm comm1, sc_MPI_Comm comm2, int *result)
+{
+  if (comm1 == comm2) {
+    *result = sc_MPI_IDENT;
+  }
+  else {
+    *result = sc_MPI_UNEQUAL;
+  }
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Comm_group (sc_MPI_Comm comm, sc_MPI_Group * group)
+{
+  *group = sc_MPI_GROUP_NULL;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Group_free (sc_MPI_Group * group)
+{
+  *group = sc_MPI_GROUP_NULL;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Group_size (sc_MPI_Group group, int *size)
+{
+  if (group == sc_MPI_GROUP_NULL) {
+    return sc_MPI_ERR_GROUP;
+  }
+  else {
+    *size = 0;
+    return sc_MPI_SUCCESS;
+  }
+}
+
+int
+sc_MPI_Group_rank (sc_MPI_Group group, int *rank)
+{
+  *rank = sc_MPI_UNDEFINED;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Group_translate_ranks (sc_MPI_Group group1, int n, int *ranks1,
+                              sc_MPI_Group group2, int *ranks2)
+{
+  int                 i;
+
+  for (i = 0; i < n; i++) {
+    ranks2[i] = sc_MPI_UNDEFINED;
+  }
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Group_compare (sc_MPI_Group group1, sc_MPI_Group group2, int *result)
+{
+  if (group1 == group2) {
+    *result = sc_MPI_IDENT;
+  }
+  else {
+    *result = sc_MPI_UNEQUAL;
+  }
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Group_union (sc_MPI_Group group1, sc_MPI_Group group2,
+                    sc_MPI_Group * newgroup)
+{
+  *newgroup = sc_MPI_GROUP_NULL;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Group_intersection (sc_MPI_Group group1, sc_MPI_Group group2,
+                           sc_MPI_Group * newgroup)
+{
+  *newgroup = sc_MPI_GROUP_NULL;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Group_difference (sc_MPI_Group group1, sc_MPI_Group group2,
+                         sc_MPI_Group * newgroup)
+{
+  *newgroup = sc_MPI_GROUP_NULL;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Group_incl (sc_MPI_Group group, int n, int *ranks,
+                   sc_MPI_Group * newgroup)
+{
+  *newgroup = sc_MPI_GROUP_NULL;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Group_excl (sc_MPI_Group group, int n, int *ranks,
+                   sc_MPI_Group * newgroup)
+{
+  *newgroup = sc_MPI_GROUP_NULL;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Group_range_incl (sc_MPI_Group group, int n, int ranges[][3],
+                         sc_MPI_Group * newgroup)
+{
+  *newgroup = sc_MPI_GROUP_NULL;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
+sc_MPI_Group_range_excl (sc_MPI_Group group, int n, int ranges[][3],
+                         sc_MPI_Group * newgroup)
+{
+  *newgroup = sc_MPI_GROUP_NULL;
+
+  return sc_MPI_SUCCESS;
+}
+
+int
 sc_MPI_Barrier (sc_MPI_Comm comm)
 {
   return sc_MPI_SUCCESS;
@@ -134,13 +291,18 @@ sc_MPI_Gather (void *p, int np, sc_MPI_Datatype tp,
                void *q, int nq, sc_MPI_Datatype tq, int rank,
                sc_MPI_Comm comm)
 {
-  size_t              lp, lq;
+  size_t              lp;
+#ifdef SC_ENABLE_DEBUG
+  size_t              lq;
+#endif
 
   SC_ASSERT (rank == 0 && np >= 0 && nq >= 0);
 
 /* *INDENT-OFF* horrible indent bug */
   lp = (size_t) np * sc_mpi_sizeof (tp);
+#ifdef SC_ENABLE_DEBUG
   lq = (size_t) nq * sc_mpi_sizeof (tq);
+#endif
 /* *INDENT-ON* */
 
   SC_ASSERT (lp == lq);
@@ -154,15 +316,20 @@ sc_MPI_Gatherv (void *p, int np, sc_MPI_Datatype tp,
                 void *q, int *recvc, int *displ,
                 sc_MPI_Datatype tq, int rank, sc_MPI_Comm comm)
 {
+  size_t              lp;
+#ifdef SC_ENABLE_DEBUG
+  size_t              lq;
   int                 nq;
-  size_t              lp, lq;
 
   nq = recvc[0];
+#endif
   SC_ASSERT (rank == 0 && np >= 0 && nq >= 0);
 
 /* *INDENT-OFF* horrible indent bug */
   lp = (size_t) np * sc_mpi_sizeof (tp);
+#ifdef SC_ENABLE_DEBUG
   lq = (size_t) nq * sc_mpi_sizeof (tq);
+#endif
 /* *INDENT-ON* */
 
   SC_ASSERT (lp == lq);
@@ -187,6 +354,13 @@ sc_MPI_Allgatherv (void *p, int np, sc_MPI_Datatype tp,
 }
 
 int
+sc_MPI_Alltoall (void *p, int np, sc_MPI_Datatype tp,
+                 void *q, int nq, sc_MPI_Datatype tq, sc_MPI_Comm comm)
+{
+  return sc_MPI_Gather (p, np, tp, q, nq, tq, 0, comm);
+}
+
+int
 sc_MPI_Reduce (void *p, void *q, int n, sc_MPI_Datatype t,
                sc_MPI_Op op, int rank, sc_MPI_Comm comm)
 {
@@ -212,6 +386,21 @@ sc_MPI_Allreduce (void *p, void *q, int n, sc_MPI_Datatype t,
 }
 
 int
+sc_MPI_Scan (void *sendbuf, void *recvbuf, int count,
+             sc_MPI_Datatype datatype, sc_MPI_Op op, sc_MPI_Comm comm)
+{
+  return sc_MPI_Reduce (sendbuf, recvbuf, count, datatype, op, 0, comm);
+}
+
+/* Stand-in without MPI: rank 0 is alone, so recvbuf is left untouched */
+int
+sc_MPI_Exscan (void *sendbuf, void *recvbuf, int count,
+               sc_MPI_Datatype datatype, sc_MPI_Op op, sc_MPI_Comm comm)
+{
+  return sc_MPI_SUCCESS;
+}
+
+int
 sc_MPI_Recv (void *buf, int count, sc_MPI_Datatype datatype, int source,
              int tag, sc_MPI_Comm comm, sc_MPI_Status * status)
 {
@@ -347,6 +536,8 @@ sc_mpi_sizeof (sc_MPI_Datatype t)
     return sizeof (long);
   if (t == sc_MPI_LONG_LONG_INT)
     return sizeof (long long);
+  if (t == sc_MPI_UNSIGNED_LONG_LONG)
+    return sizeof (unsigned long long);
   if (t == sc_MPI_FLOAT)
     return sizeof (float);
   if (t == sc_MPI_DOUBLE)
@@ -356,3 +547,191 @@ sc_mpi_sizeof (sc_MPI_Datatype t)
 
   SC_ABORT_NOT_REACHED ();
 }
+
+#if defined(SC_ENABLE_MPI)
+
+/* these should be initialized in sc_init() */
+static int          sc_mpi_node_comm_keyval = MPI_KEYVAL_INVALID;
+
+static int
+sc_mpi_node_comms_destroy (MPI_Comm comm, int comm_keyval,
+                           void *attribute_val, void *extra_state)
+{
+  int                 mpiret;
+  MPI_Comm           *node_comms = (MPI_Comm *) attribute_val;
+  /* keyval delete callback: free intranode [0], internode [1], the array */
+  mpiret = MPI_Comm_free (&node_comms[0]);
+  if (mpiret != MPI_SUCCESS) {
+    return mpiret;
+  }
+  mpiret = MPI_Comm_free (&node_comms[1]);
+  if (mpiret != MPI_SUCCESS) {
+    return mpiret;
+  }
+  mpiret = MPI_Free_mem (node_comms);
+  /* report a failed release to MPI instead of always claiming success */
+  return mpiret;
+}
+
+static int
+sc_mpi_node_comms_copy (MPI_Comm oldcomm, int comm_keyval,
+                        void *extra_state, void *attribute_val_in,
+                        void *attribute_val_out, int *flag)
+{
+  MPI_Comm           *node_comms_in = (MPI_Comm *) attribute_val_in;
+  MPI_Comm           *node_comms_out;
+  int                 mpiret;
+
+  /* Copy callback: duplicate both comms for the new communicator.  SC_ALLOC
+   * is not usable since these might be destroyed after sc finalizes */
+  mpiret =
+    MPI_Alloc_mem (2 * sizeof (MPI_Comm), MPI_INFO_NULL, &node_comms_out);
+  if (mpiret != MPI_SUCCESS) {
+    return mpiret;
+  }
+  /* from here on, release what was acquired before reporting an error */
+  mpiret = MPI_Comm_dup (node_comms_in[0], &node_comms_out[0]);
+  if (mpiret != MPI_SUCCESS) {
+    (void) MPI_Free_mem (node_comms_out);
+    return mpiret;
+  }
+  mpiret = MPI_Comm_dup (node_comms_in[1], &node_comms_out[1]);
+  if (mpiret != MPI_SUCCESS) {
+    (void) MPI_Comm_free (&node_comms_out[0]);
+    (void) MPI_Free_mem (node_comms_out);
+    return mpiret;
+  }
+  *((MPI_Comm **) attribute_val_out) = node_comms_out;
+  *flag = 1;
+  return MPI_SUCCESS;
+}
+
+#endif /* SC_ENABLE_MPI */
+
+void
+sc_mpi_comm_attach_node_comms (sc_MPI_Comm comm, int processes_per_node)
+{
+#if defined(SC_ENABLE_MPI)
+  int                 mpiret, rank, size;
+  MPI_Comm           *node_comms, internode, intranode;
+
+  if (sc_mpi_node_comm_keyval == MPI_KEYVAL_INVALID) {
+    /* register the node comm attachment with MPI */
+    mpiret =
+      MPI_Comm_create_keyval (sc_mpi_node_comms_copy,
+                              sc_mpi_node_comms_destroy,
+                              &sc_mpi_node_comm_keyval, NULL);
+    SC_CHECK_MPI (mpiret);
+  }
+  SC_ASSERT (sc_mpi_node_comm_keyval != MPI_KEYVAL_INVALID);
+
+  mpiret = MPI_Comm_size (comm, &size);
+  SC_CHECK_MPI (mpiret);
+  mpiret = MPI_Comm_rank (comm, &rank);
+  SC_CHECK_MPI (mpiret);
+  /* no usable count given: ask MPI which ranks share memory */
+  if (processes_per_node < 1) {
+#if !defined(SC_ENABLE_MPICOMMSHARED)
+    SC_ABORT
+      ("Require MPI-3 or greater to automatically determine node communicators");
+#else
+    int                 intrasize, intrarank, maxintrasize, minintrasize;
+
+    mpiret =
+      MPI_Comm_split_type (comm, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
+                           &intranode);
+    SC_CHECK_MPI (mpiret);
+
+    /* We only accept node comms if they are all the same size */
+    mpiret = MPI_Comm_size (intranode, &intrasize);
+    SC_CHECK_MPI (mpiret);
+    mpiret = MPI_Comm_rank (intranode, &intrarank);
+    SC_CHECK_MPI (mpiret);
+
+    mpiret =
+      MPI_Allreduce (&intrasize, &maxintrasize, 1, MPI_INT, MPI_MAX, comm);
+    SC_CHECK_MPI (mpiret);
+    mpiret =
+      MPI_Allreduce (&intrasize, &minintrasize, 1, MPI_INT, MPI_MIN, comm);
+    SC_CHECK_MPI (mpiret);
+
+    if (maxintrasize != minintrasize) {
+      SC_GLOBAL_LDEBUG
+        ("node communicators are not the same size: not attaching\n");
+
+      mpiret = MPI_Comm_free (&intranode);
+      SC_CHECK_MPI (mpiret);
+
+      return;
+    }
+    /* ranks with the same intranode rank share an internode comm */
+    mpiret = MPI_Comm_split (comm, intrarank, rank, &internode);
+    SC_CHECK_MPI (mpiret);
+#endif
+  }
+  else {
+    int                 node, offset;
+
+    SC_ASSERT (!(size % processes_per_node));
+    /* count taken at face value: consecutive ranks form one node */
+    node = rank / processes_per_node;
+    offset = rank % processes_per_node;
+
+    mpiret = MPI_Comm_split (comm, node, offset, &intranode);
+    SC_CHECK_MPI (mpiret);
+
+    mpiret = MPI_Comm_split (comm, offset, node, &internode);
+    SC_CHECK_MPI (mpiret);
+  }
+
+  /* We can't use SC_ALLOC because these might be destroyed after
+   * sc finalizes */
+  mpiret = MPI_Alloc_mem (2 * sizeof (MPI_Comm), MPI_INFO_NULL, &node_comms);
+  SC_CHECK_MPI (mpiret);
+  node_comms[0] = intranode;
+  node_comms[1] = internode;
+  /* store the pair on comm under the keyval registered above */
+  mpiret = MPI_Comm_set_attr (comm, sc_mpi_node_comm_keyval, node_comms);
+  SC_CHECK_MPI (mpiret);
+#endif
+}
+
+void
+sc_mpi_comm_detach_node_comms (sc_MPI_Comm comm)
+{
+#if defined(SC_ENABLE_MPI)
+  /* an unregistered keyval means nothing was attached: do not query MPI */
+  if (comm != MPI_COMM_NULL &&
+      sc_mpi_node_comm_keyval != MPI_KEYVAL_INVALID) {
+    int mpiret = MPI_Comm_delete_attr (comm, sc_mpi_node_comm_keyval);
+    SC_CHECK_MPI (mpiret);
+  }
+#endif
+}
+
+void
+sc_mpi_comm_get_node_comms (sc_MPI_Comm comm,
+                            sc_MPI_Comm * intranode, sc_MPI_Comm * internode)
+{
+#ifdef SC_ENABLE_MPI
+  int                 mpiret, flag;
+  sc_MPI_Comm        *node_comms;
+#endif
+
+  *intranode = sc_MPI_COMM_NULL;
+  *internode = sc_MPI_COMM_NULL;
+#if defined(SC_ENABLE_MPI)
+  if (sc_mpi_node_comm_keyval == MPI_KEYVAL_INVALID) {
+    SC_GLOBAL_LDEBUG
+      ("Asking for node comms before sc_mpi_comm_attach_node_comms is called\n");
+    return;
+  }
+  mpiret =
+    MPI_Comm_get_attr (comm, sc_mpi_node_comm_keyval, &node_comms, &flag);
+  SC_CHECK_MPI (mpiret);
+  if (flag && node_comms) {
+    *intranode = (MPI_Comm) node_comms[0];
+    *internode = (MPI_Comm) node_comms[1];
+  }
+#endif
+}
diff --git a/sc/src/sc_mpi.h b/sc/src/sc_mpi.h
index 601239d..a28b6c3 100644
--- a/sc/src/sc_mpi.h
+++ b/sc/src/sc_mpi.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -51,14 +52,16 @@ SC_EXTERN_C_BEGIN;
 
 typedef enum
 {
-  SC_TAG_AG_ALLTOALL = 's' + 'c',       /* anything really */
+  SC_TAG_FIRST = 's' + 'c',     /* anything really */
+  SC_TAG_AG_ALLTOALL = SC_TAG_FIRST,
   SC_TAG_AG_RECURSIVE_A,
   SC_TAG_AG_RECURSIVE_B,
   SC_TAG_AG_RECURSIVE_C,
   SC_TAG_NOTIFY_RECURSIVE,
-  SC_TAG_REDUCE,
+  SC_TAG_REDUCE = SC_TAG_NOTIFY_RECURSIVE + 32,
   SC_TAG_PSORT_LO,
-  SC_TAG_PSORT_HI
+  SC_TAG_PSORT_HI,
+  SC_TAG_LAST
 }
 sc_tag_t;
 
@@ -71,6 +74,14 @@ sc_tag_t;
 #define sc_MPI_COMM_WORLD          MPI_COMM_WORLD
 #define sc_MPI_COMM_SELF           MPI_COMM_SELF
 
+#define sc_MPI_GROUP_NULL          MPI_GROUP_NULL
+#define sc_MPI_GROUP_EMPTY         MPI_GROUP_EMPTY
+
+#define sc_MPI_IDENT               MPI_IDENT
+#define sc_MPI_CONGRUENT           MPI_CONGRUENT
+#define sc_MPI_SIMILAR             MPI_SIMILAR
+#define sc_MPI_UNEQUAL             MPI_UNEQUAL
+
 #define sc_MPI_ANY_SOURCE          MPI_ANY_SOURCE
 #define sc_MPI_ANY_TAG             MPI_ANY_TAG
 #define sc_MPI_STATUS_IGNORE       MPI_STATUS_IGNORE
@@ -78,6 +89,8 @@ sc_tag_t;
 
 #define sc_MPI_REQUEST_NULL        MPI_REQUEST_NULL
 
+#define sc_MPI_DATATYPE_NULL       MPI_DATATYPE_NULL
+
 #define sc_MPI_CHAR                MPI_CHAR
 #define sc_MPI_SIGNED_CHAR         MPI_SIGNED_CHAR
 #define sc_MPI_UNSIGNED_CHAR       MPI_UNSIGNED_CHAR
@@ -89,6 +102,7 @@ sc_tag_t;
 #define sc_MPI_LONG                MPI_LONG
 #define sc_MPI_UNSIGNED_LONG       MPI_UNSIGNED_LONG
 #define sc_MPI_LONG_LONG_INT       MPI_LONG_LONG_INT
+#define sc_MPI_UNSIGNED_LONG_LONG  MPI_UNSIGNED_LONG_LONG
 #define sc_MPI_FLOAT               MPI_FLOAT
 #define sc_MPI_DOUBLE              MPI_DOUBLE
 #define sc_MPI_LONG_DOUBLE         MPI_LONG_DOUBLE
@@ -109,9 +123,12 @@ sc_tag_t;
 
 #define sc_MPI_UNDEFINED           MPI_UNDEFINED
 
+#define sc_MPI_ERR_GROUP           MPI_ERR_GROUP
+
 /* types */
 
 #define sc_MPI_Comm                MPI_Comm
+#define sc_MPI_Group               MPI_Group
 #define sc_MPI_Datatype            MPI_Datatype
 #define sc_MPI_Op                  MPI_Op
 #define sc_MPI_Request             MPI_Request
@@ -124,17 +141,36 @@ sc_tag_t;
 #define sc_MPI_Finalize            MPI_Finalize
 #define sc_MPI_Abort               MPI_Abort
 #define sc_MPI_Comm_dup            MPI_Comm_dup
+#define sc_MPI_Comm_create         MPI_Comm_create
+#define sc_MPI_Comm_split          MPI_Comm_split
 #define sc_MPI_Comm_free           MPI_Comm_free
 #define sc_MPI_Comm_size           MPI_Comm_size
 #define sc_MPI_Comm_rank           MPI_Comm_rank
+#define sc_MPI_Comm_compare        MPI_Comm_compare
+#define sc_MPI_Comm_group          MPI_Comm_group
+#define sc_MPI_Group_free          MPI_Group_free
+#define sc_MPI_Group_size          MPI_Group_size
+#define sc_MPI_Group_rank          MPI_Group_rank
+#define sc_MPI_Group_translate_ranks MPI_Group_translate_ranks
+#define sc_MPI_Group_compare       MPI_Group_compare
+#define sc_MPI_Group_union         MPI_Group_union
+#define sc_MPI_Group_intersection  MPI_Group_intersection
+#define sc_MPI_Group_difference    MPI_Group_difference
+#define sc_MPI_Group_incl          MPI_Group_incl
+#define sc_MPI_Group_excl          MPI_Group_excl
+#define sc_MPI_Group_range_incl    MPI_Group_range_incl
+#define sc_MPI_Group_range_excl    MPI_Group_range_excl
 #define sc_MPI_Barrier             MPI_Barrier
 #define sc_MPI_Bcast               MPI_Bcast
 #define sc_MPI_Gather              MPI_Gather
 #define sc_MPI_Gatherv             MPI_Gatherv
 #define sc_MPI_Allgather           MPI_Allgather
 #define sc_MPI_Allgatherv          MPI_Allgatherv
+#define sc_MPI_Alltoall            MPI_Alltoall
 #define sc_MPI_Reduce              MPI_Reduce
 #define sc_MPI_Allreduce           MPI_Allreduce
+#define sc_MPI_Scan                MPI_Scan
+#define sc_MPI_Exscan              MPI_Exscan
 #define sc_MPI_Recv                MPI_Recv
 #define sc_MPI_Irecv               MPI_Irecv
 #define sc_MPI_Send                MPI_Send
@@ -156,6 +192,14 @@ sc_tag_t;
 #define sc_MPI_COMM_WORLD          ((sc_MPI_Comm) 0x44000000)
 #define sc_MPI_COMM_SELF           ((sc_MPI_Comm) 0x44000001)
 
+#define sc_MPI_GROUP_NULL          ((sc_MPI_Group) 0x54000000)  /* TODO change val */
+#define sc_MPI_GROUP_EMPTY         ((sc_MPI_Group) 0x54000001)  /* TODO change val */
+
+#define sc_MPI_IDENT               (1)  /* TODO change val */
+#define sc_MPI_CONGRUENT           (2)  /* TODO change val */
+#define sc_MPI_SIMILAR             (3)  /* TODO change val */
+#define sc_MPI_UNEQUAL             (-1) /* TODO change val */
+
 #define sc_MPI_ANY_SOURCE          (-2)
 #define sc_MPI_ANY_TAG             (-1)
 #define sc_MPI_STATUS_IGNORE       (sc_MPI_Status *) 1
@@ -163,6 +207,8 @@ sc_tag_t;
 
 #define sc_MPI_REQUEST_NULL        ((sc_MPI_Request) 0x2c000000)
 
+#define sc_MPI_DATATYPE_NULL       ((sc_MPI_Datatype) 0x4c000000)
+
 #define sc_MPI_CHAR                ((sc_MPI_Datatype) 0x4c000101)
 #define sc_MPI_SIGNED_CHAR         ((sc_MPI_Datatype) 0x4c000118)
 #define sc_MPI_UNSIGNED_CHAR       ((sc_MPI_Datatype) 0x4c000102)
@@ -174,6 +220,7 @@ sc_tag_t;
 #define sc_MPI_LONG                ((sc_MPI_Datatype) 0x4c000407)
 #define sc_MPI_UNSIGNED_LONG       ((sc_MPI_Datatype) 0x4c000408)
 #define sc_MPI_LONG_LONG_INT       ((sc_MPI_Datatype) 0x4c000809)
+#define sc_MPI_UNSIGNED_LONG_LONG  ((sc_MPI_Datatype) 0x4c000409)
 #define sc_MPI_FLOAT               ((sc_MPI_Datatype) 0x4c00040a)
 #define sc_MPI_DOUBLE              ((sc_MPI_Datatype) 0x4c00080b)
 #define sc_MPI_LONG_DOUBLE         ((sc_MPI_Datatype) 0x4c000c0c)
@@ -194,9 +241,12 @@ sc_tag_t;
 
 #define sc_MPI_UNDEFINED           (-32766)
 
+#define sc_MPI_ERR_GROUP           (-123456)    /* TODO change val */
+
 /* types */
 
 typedef int         sc_MPI_Comm;
+typedef int         sc_MPI_Group;
 typedef int         sc_MPI_Datatype;
 typedef int         sc_MPI_Op;
 typedef int         sc_MPI_Request;
@@ -220,9 +270,35 @@ int                 sc_MPI_Abort (sc_MPI_Comm, int)
   __attribute__ ((noreturn));
 
 int                 sc_MPI_Comm_dup (sc_MPI_Comm, sc_MPI_Comm *);
+int                 sc_MPI_Comm_create (sc_MPI_Comm, sc_MPI_Group,
+                                        sc_MPI_Comm *);
+int                 sc_MPI_Comm_split (sc_MPI_Comm, int, int, sc_MPI_Comm *);
 int                 sc_MPI_Comm_free (sc_MPI_Comm *);
 int                 sc_MPI_Comm_size (sc_MPI_Comm, int *);
 int                 sc_MPI_Comm_rank (sc_MPI_Comm, int *);
+int                 sc_MPI_Comm_compare (sc_MPI_Comm, sc_MPI_Comm, int *);
+int                 sc_MPI_Comm_group (sc_MPI_Comm, sc_MPI_Group *);
+
+int                 sc_MPI_Group_free (sc_MPI_Group *);
+int                 sc_MPI_Group_size (sc_MPI_Group, int *);
+int                 sc_MPI_Group_rank (sc_MPI_Group, int *);
+int                 sc_MPI_Group_translate_ranks (sc_MPI_Group, int, int *,
+                                                  sc_MPI_Group, int *);
+int                 sc_MPI_Group_compare (sc_MPI_Group, sc_MPI_Group, int *);
+int                 sc_MPI_Group_union (sc_MPI_Group, sc_MPI_Group,
+                                        sc_MPI_Group *);
+int                 sc_MPI_Group_intersection (sc_MPI_Group, sc_MPI_Group,
+                                               sc_MPI_Group *);
+int                 sc_MPI_Group_difference (sc_MPI_Group, sc_MPI_Group,
+                                             sc_MPI_Group *);
+int                 sc_MPI_Group_incl (sc_MPI_Group, int, int *,
+                                       sc_MPI_Group *);
+int                 sc_MPI_Group_excl (sc_MPI_Group, int, int *,
+                                       sc_MPI_Group *);
+int                 sc_MPI_Group_range_incl (sc_MPI_Group, int,
+                                             int ranges[][3], sc_MPI_Group *);
+int                 sc_MPI_Group_range_excl (sc_MPI_Group, int,
+                                             int ranges[][3], sc_MPI_Group *);
 
 int                 sc_MPI_Barrier (sc_MPI_Comm);
 int                 sc_MPI_Bcast (void *, int, sc_MPI_Datatype, int,
@@ -237,10 +313,16 @@ int                 sc_MPI_Allgather (void *, int, sc_MPI_Datatype, void *,
 int                 sc_MPI_Allgatherv (void *, int, sc_MPI_Datatype, void *,
                                        int *, int *, sc_MPI_Datatype,
                                        sc_MPI_Comm);
+int                 sc_MPI_Alltoall (void *, int, sc_MPI_Datatype, void *,
+                                     int, sc_MPI_Datatype, sc_MPI_Comm);
 int                 sc_MPI_Reduce (void *, void *, int, sc_MPI_Datatype,
                                    sc_MPI_Op, int, sc_MPI_Comm);
 int                 sc_MPI_Allreduce (void *, void *, int, sc_MPI_Datatype,
                                       sc_MPI_Op, sc_MPI_Comm);
+int                 sc_MPI_Scan (void *, void *, int, sc_MPI_Datatype,
+                                 sc_MPI_Op, sc_MPI_Comm);
+int                 sc_MPI_Exscan (void *, void *, int, sc_MPI_Datatype,
+                                   sc_MPI_Op, sc_MPI_Comm);
 
 double              sc_MPI_Wtime (void);
 
@@ -296,6 +378,39 @@ int                 sc_MPI_Init_thread (int *argc, char ***argv,
  */
 size_t              sc_mpi_sizeof (sc_MPI_Datatype t);
 
+/** Compute ``sc_intranode_comm'' and ``sc_internode_comm''
+ * communicators and attach them to the current communicator.  This split
+ * takes \a processes_per_node passed by the user at face value: there is no
+ * hardware checking to see if this is the true affinity.
+ *
+ * \param [in/out] comm                 MPI communicator
+ * \param [in]     processes_per_node   the size of the intranode
+ *                                      communicators. if < 1,
+ *                                      sc will try to determine the correct
+ *                                      shared memory communicators.
+ */
+void                sc_mpi_comm_attach_node_comms (sc_MPI_Comm comm,
+                                                   int processes_per_node);
+
+/** Destroy ``sc_intranode_comm'' and ``sc_internode_comm''
+ * communicators that are stored as attributes to communicator ``comm''.
+ * This routine enforces a call to the destroy callback for these attributes.
+ *
+ * \param [in/out] comm                 MPI communicator
+ */
+void                sc_mpi_comm_detach_node_comms (sc_MPI_Comm comm);
+
+/** Get the communicators computed in sc_mpi_comm_attach_node_comms() if they
+ * exist; return sc_MPI_COMM_NULL otherwise.
+ *
+ * \param[in] comm            Super communicator
+ * \param[out] intranode      intranode communicator
+ * \param[out] internode      internode communicator
+ */
+void                sc_mpi_comm_get_node_comms (sc_MPI_Comm comm,
+                                                sc_MPI_Comm * intranode,
+                                                sc_MPI_Comm * internode);
+
 SC_EXTERN_C_END;
 
 #endif /* !SC_MPI_H */
diff --git a/sc/src/sc_notify.c b/sc/src/sc_notify.c
index 0904890..3ef1cd9 100644
--- a/sc/src/sc_notify.c
+++ b/sc/src/sc_notify.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_notify.h b/sc/src/sc_notify.h
index 0fb9bcb..9f5ddb9 100644
--- a/sc/src/sc_notify.h
+++ b/sc/src/sc_notify.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_obstack.h b/sc/src/sc_obstack.h
index 095d2af..1f85b7a 100644
--- a/sc/src/sc_obstack.h
+++ b/sc/src/sc_obstack.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_options.c b/sc/src/sc_options.c
index ad83d9e..ff11080 100644
--- a/sc/src/sc_options.c
+++ b/sc/src/sc_options.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -26,8 +27,54 @@
 
 #include <errno.h>
 
+typedef enum
+{
+  SC_OPTION_SWITCH,
+  SC_OPTION_BOOL,
+  SC_OPTION_INT,
+  SC_OPTION_SIZE_T,
+  SC_OPTION_DOUBLE,
+  SC_OPTION_STRING,
+  SC_OPTION_INIFILE,
+  SC_OPTION_CALLBACK,
+  SC_OPTION_KEYVALUE
+}
+sc_option_type_t;
+
+typedef struct
+{
+  sc_option_type_t    opt_type;
+  int                 opt_char;
+  const char         *opt_name;
+  void               *opt_var;
+  void                (*opt_fn) (void);
+  int                 has_arg;
+  int                 called;           /**< set to 0 and ignored */
+  const char         *help_string;
+  char               *string_value;     /**< set on call but ignored */
+  void               *user_data;
+}
+sc_option_item_t;
+
+struct sc_options
+{
+  char                program_path[BUFSIZ];
+  const char         *program_name;
+  sc_array_t         *option_items;
+  int                 space_type;
+  int                 space_help;
+  int                 args_alloced;
+  int                 first_arg;
+  int                 argc;
+  char              **argv;
+  sc_array_t         *subopt_names;
+};
+
 static char        *sc_iniparser_invalid_key = (char *) -1;
 
+static const int    sc_options_space_type = 20;
+static const int    sc_options_space_help = 32;
+
 static int
 sc_iniparser_getint (dictionary * d, const char *key, int notfound,
                      int *iserror)
@@ -139,11 +186,14 @@ sc_options_new (const char *program_path)
   opt->argc = 0;
   opt->argv = NULL;
 
+  /* set default spacing for printing option summary */
+  sc_options_set_spacing (opt, -1, -1);
+
   return opt;
 }
 
-void
-sc_options_destroy (sc_options_t * opt)
+static void
+sc_options_destroy_internal (sc_options_t * opt, int deep)
 {
   size_t              iz;
   sc_array_t         *items = opt->option_items;
@@ -154,6 +204,9 @@ sc_options_destroy (sc_options_t * opt)
 
   for (iz = 0; iz < count; ++iz) {
     item = (sc_option_item_t *) sc_array_index (items, iz);
+    if (deep && item->opt_type == SC_OPTION_KEYVALUE) {
+      sc_keyvalue_destroy ((sc_keyvalue_t *) item->user_data);
+    }
     SC_FREE (item->string_value);
   }
 
@@ -171,6 +224,27 @@ sc_options_destroy (sc_options_t * opt)
 }
 
 void
+sc_options_destroy_deep (sc_options_t * opt)
+{
+  sc_options_destroy_internal (opt, 1);
+}
+
+void
+sc_options_destroy (sc_options_t * opt)
+{
+  sc_options_destroy_internal (opt, 0);
+}
+
+void
+sc_options_set_spacing (sc_options_t * opt, int space_type, int space_help)
+{
+  SC_ASSERT (opt != NULL);
+
+  opt->space_type = space_type < 0 ? sc_options_space_type : space_type;
+  opt->space_help = space_help < 0 ? sc_options_space_help : space_help;
+}
+
+void
 sc_options_add_switch (sc_options_t * opt, int opt_char,
                        const char *opt_name,
                        int *variable, const char *help_string)
@@ -375,6 +449,39 @@ sc_options_add_callback (sc_options_t * opt, int opt_char,
 }
 
 void
+sc_options_add_keyvalue (sc_options_t * opt,
+                         int opt_char, const char *opt_name,
+                         int *variable, const char *init_value,
+                         sc_keyvalue_t * keyvalue, const char *help_string)
+{
+  sc_option_item_t   *item;
+
+  SC_ASSERT (opt_char != '\0' || opt_name != NULL);
+  SC_ASSERT (opt_name == NULL || opt_name[0] != '-');
+
+  /* we do not accept an invalid default value */
+  SC_ASSERT (variable != NULL);
+  SC_ASSERT (init_value != NULL);
+  SC_ASSERT (keyvalue != NULL);
+
+  item = (sc_option_item_t *) sc_array_push (opt->option_items);
+
+  item->opt_type = SC_OPTION_KEYVALUE;
+  item->opt_char = opt_char;
+  item->opt_name = opt_name;
+  item->opt_var = variable;
+  item->opt_fn = NULL;
+  item->has_arg = 1;
+  item->called = 0;
+  item->help_string = help_string;
+  item->user_data = keyvalue;
+
+  /* we expect that the key points to a valid integer entry by design */
+  *variable = sc_keyvalue_get_int_check (keyvalue, init_value, NULL);
+  item->string_value = SC_STRDUP (init_value);
+}
+
+void
 sc_options_add_suboptions (sc_options_t * opt,
                            sc_options_t * subopt, const char *prefix)
 {
@@ -434,6 +541,13 @@ sc_options_add_suboptions (sc_options_t * opt,
                                (sc_options_callback_t) item->opt_fn,
                                item->user_data, item->help_string);
       break;
+    case SC_OPTION_KEYVALUE:
+      SC_ASSERT (item->string_value != NULL);
+      sc_options_add_keyvalue (opt, '\0', *name,
+                               (int *) item->opt_var, item->string_value,
+                               (sc_keyvalue_t *) item->user_data,
+                               item->help_string);
+      break;
     default:
       SC_ABORT_NOT_REACHED ();
     }
@@ -449,8 +563,8 @@ sc_options_print_usage (int package_id, int log_priority,
   sc_array_t         *items = opt->option_items;
   size_t              count = items->elem_count;
   sc_option_item_t   *item;
-  const char         *provide_short;
-  const char         *provide_long;
+  const char         *provide;
+  const char         *separator;
   char                outbuf[BUFSIZ];
   char               *copy, *tok;
 
@@ -464,65 +578,68 @@ sc_options_print_usage (int package_id, int log_priority,
 
   for (iz = 0; iz < count; ++iz) {
     item = (sc_option_item_t *) sc_array_index (items, iz);
-    provide_short = "";
-    provide_long = "";
+    provide = "";
     switch (item->opt_type) {
     case SC_OPTION_SWITCH:
       break;
     case SC_OPTION_BOOL:
-      provide_short = " [0fFnN1tTyY]";
-      provide_long = "[=0fFnN1tTyY]";
+      provide = "[0fFnN1tTyY]";
       break;
     case SC_OPTION_INT:
-      provide_short = " <INT>";
-      provide_long = "=<INT>";
+      provide = "<INT>";
       break;
     case SC_OPTION_SIZE_T:
-      provide_short = " <SIZE_T>";
-      provide_long = "=<SIZE_T>";
+      provide = "<SIZE_T>";
       break;
     case SC_OPTION_DOUBLE:
-      provide_short = " <REAL>";
-      provide_long = "=<REAL>";
+      provide = "<REAL>";
       break;
     case SC_OPTION_STRING:
-      provide_short = " <STRING>";
-      provide_long = "=<STRING>";
+      provide = "<STRING>";
       break;
     case SC_OPTION_INIFILE:
-      provide_short = " <INIFILE>";
-      provide_long = "=<INIFILE>";
+      provide = "<FILE>";
       break;
     case SC_OPTION_CALLBACK:
       if (item->has_arg) {
-        provide_short = " <ARG>";
-        provide_long = "=<ARG>";
+        provide = "<ARG>";
       }
       break;
+    case SC_OPTION_KEYVALUE:
+      provide = "<CHOICE>";
+      break;
     default:
       SC_ABORT_NOT_REACHED ();
     }
+#if 0
+    separator = item->has_arg ? "=" : "";
+#else
+    separator = "";
+#endif
     outbuf[0] = '\0';
     printed = 0;
     if (item->opt_char != '\0' && item->opt_name != NULL) {
-      printed = snprintf (outbuf, BUFSIZ, "   -%c%s | --%s%s",
-                          item->opt_char, provide_short,
-                          item->opt_name, provide_long);
+      printed +=
+        snprintf (outbuf + printed, BUFSIZ - printed, "   -%c | --%s%s",
+                  item->opt_char, item->opt_name, separator);
     }
     else if (item->opt_char != '\0') {
-      printed = snprintf (outbuf, BUFSIZ, "   -%c%s",
-                          item->opt_char, provide_short);
+      printed += snprintf (outbuf + printed, BUFSIZ - printed, "   -%c",
+                           item->opt_char);
     }
     else if (item->opt_name != NULL) {
-      printed = snprintf (outbuf, BUFSIZ, "   --%s%s",
-                          item->opt_name, provide_long);
+      printed += snprintf (outbuf + printed, BUFSIZ - printed, "   --%s%s",
+                           item->opt_name, separator);
     }
     else {
       SC_ABORT_NOT_REACHED ();
     }
+    printed += snprintf (outbuf + printed, BUFSIZ - printed, "%*s%s",
+                         SC_MAX (1, opt->space_type - printed), "", provide);
     if (item->help_string != NULL) {
-      snprintf (outbuf + printed, BUFSIZ - printed, "%*s%s",
-                SC_MAX (1, 40 - printed), "", item->help_string);
+      printed += snprintf (outbuf + printed, BUFSIZ - printed, "%*s%s",
+                           SC_MAX (1, opt->space_help - printed), "",
+                           item->help_string);
     }
     SC_GEN_LOGF (package_id, SC_LC_GLOBAL, log_priority, "%s\n", outbuf);
   }
@@ -555,15 +672,21 @@ sc_options_print_summary (int package_id, int log_priority,
 
   for (iz = 0; iz < count; ++iz) {
     item = (sc_option_item_t *) sc_array_index (items, iz);
-    if (item->opt_type == SC_OPTION_INIFILE) {
+    if (item->opt_type == SC_OPTION_INIFILE ||
+        item->opt_type == SC_OPTION_CALLBACK) {
       continue;
     }
+    printed = 0;
     if (item->opt_name == NULL) {
-      printed = snprintf (outbuf, BUFSIZ, "   -%c: ", item->opt_char);
+      printed += snprintf (outbuf + printed, BUFSIZ - printed, "   -%c",
+                           item->opt_char);
     }
     else {
-      printed = snprintf (outbuf, BUFSIZ, "   %s: ", item->opt_name);
+      printed += snprintf (outbuf + printed, BUFSIZ - printed, "   %s",
+                           item->opt_name);
     }
+    printed += snprintf (outbuf + printed, BUFSIZ - printed, "%*s",
+                         SC_MAX (1, opt->space_type - printed), "");
     switch (item->opt_type) {
     case SC_OPTION_SWITCH:
       bvalue = *(int *) item->opt_var;
@@ -598,6 +721,7 @@ sc_options_print_summary (int package_id, int log_priority,
       printed += snprintf (outbuf + printed, BUFSIZ - printed,
                            "%s", string_val);
       break;
+#if 0
     case SC_OPTION_CALLBACK:
       if (item->called) {
         string_val = item->has_arg ? item->string_value : "true";
@@ -608,6 +732,12 @@ sc_options_print_summary (int package_id, int log_priority,
       printed += snprintf (outbuf + printed, BUFSIZ - printed,
                            "%s", string_val);
       break;
+#endif
+    case SC_OPTION_KEYVALUE:
+      SC_ASSERT (item->string_value != NULL);
+      printed += snprintf (outbuf + printed, BUFSIZ - printed,
+                           "%s", item->string_value);
+      break;
     default:
       SC_ABORT_NOT_REACHED ();
     }
@@ -650,7 +780,6 @@ sc_options_load (int package_id, int err_priority,
   size_t             *zvalue;
   const char         *s, *key;
   char                skey[BUFSIZ], lkey[BUFSIZ];
-  sc_options_callback_t fn;
 
   dict = iniparser_load (inifile);
   if (dict == NULL) {
@@ -661,7 +790,8 @@ sc_options_load (int package_id, int err_priority,
 
   for (iz = 0; iz < count; ++iz) {
     item = (sc_option_item_t *) sc_array_index (items, iz);
-    if (item->opt_type == SC_OPTION_INIFILE) {
+    if (item->opt_type == SC_OPTION_INIFILE ||
+        item->opt_type == SC_OPTION_CALLBACK) {
       continue;
     }
 
@@ -761,6 +891,7 @@ sc_options_load (int package_id, int err_priority,
         *(const char **) item->opt_var = item->string_value = SC_STRDUP (s);
       }
       break;
+#if 0
     case SC_OPTION_CALLBACK:
       if (item->has_arg) {
         s = iniparser_getstring (dict, key, NULL);
@@ -782,6 +913,27 @@ sc_options_load (int package_id, int err_priority,
         return -1;
       }
       break;
+#endif
+    case SC_OPTION_KEYVALUE:
+      SC_ASSERT (item->string_value != NULL);
+      s = iniparser_getstring (dict, key, NULL);
+      if (s != NULL) {
+        /* lookup the key and see if the result is valid */
+        iserror = *(ivalue = (int *) item->opt_var);
+        *ivalue = sc_keyvalue_get_int_check ((sc_keyvalue_t *)
+                                             item->user_data, s, &iserror);
+        if (iserror) {
+          /* key not found or of the wrong type; this cannot be ignored */
+          SC_GEN_LOGF (package_id, SC_LC_GLOBAL, err_priority,
+                       "Invalid key %s for option %s in file: %s\n",
+                       s, key, inifile);
+          iniparser_freedict (dict);
+          return -1;
+        }
+        SC_FREE (item->string_value);
+        item->string_value = SC_STRDUP (s);
+      }
+      break;
     default:
       SC_ABORT_NOT_REACHED ();
     }
@@ -837,10 +989,8 @@ sc_options_save (int package_id, int err_priority,
     if (item->opt_type == SC_OPTION_STRING && item->string_value == NULL) {
       continue;
     }
-    if (item->opt_type == SC_OPTION_INIFILE) {
-      continue;
-    }
-    if (item->opt_type == SC_OPTION_CALLBACK && !item->called) {
+    if (item->opt_type == SC_OPTION_INIFILE ||
+        item->opt_type == SC_OPTION_CALLBACK) {
       continue;
     }
 
@@ -917,6 +1067,7 @@ sc_options_save (int package_id, int err_priority,
     case SC_OPTION_STRING:
       retval = fprintf (file, "%s\n", item->string_value);
       break;
+#if 0
     case SC_OPTION_CALLBACK:
       if (item->has_arg) {
         SC_ASSERT (item->string_value != NULL);
@@ -926,6 +1077,11 @@ sc_options_save (int package_id, int err_priority,
         retval = fprintf (file, "%s\n", "true");
       }
       break;
+#endif
+    case SC_OPTION_KEYVALUE:
+      SC_ASSERT (item->string_value != NULL);
+      retval = fprintf (file, "%s\n", item->string_value);
+      break;
     default:
       SC_ABORT_NOT_REACHED ();
     }
@@ -970,10 +1126,11 @@ int
 sc_options_parse (int package_id, int err_priority, sc_options_t * opt,
                   int argc, char **argv)
 {
-  int                 retval;
+  int                 retval, iserror;
   int                 position, printed;
   int                 c, option_index;
   int                 item_index = -1;
+  int                *ivalue;
   size_t              iz;
   long                ilong;
   long long           ilonglong;
@@ -1019,13 +1176,14 @@ sc_options_parse (int package_id, int err_priority, sc_options_t * opt,
       break;
     }
     if (c == '?') {             /* invalid option */
-      if (optopt == 0) {
+      if (optopt == '-' || !isprint (optopt)) {
         SC_GEN_LOG (package_id, SC_LC_GLOBAL, err_priority,
-                    "Encountered invalid long option\n");
+                    "Invalid long option or missing argument\n");
       }
       else {
         SC_GEN_LOGF (package_id, SC_LC_GLOBAL, err_priority,
-                     "Encountered invalid short option: -%c\n", optopt);
+                     "Invalid short option: -%c or missing argument\n",
+                     optopt);
       }
       retval = -1;
       break;
@@ -1136,6 +1294,21 @@ sc_options_parse (int package_id, int err_priority, sc_options_t * opt,
         retval = -1;            /* this ends option processing */
       }
       break;
+    case SC_OPTION_KEYVALUE:
+      SC_ASSERT (item->string_value != NULL);
+      iserror = *(ivalue = (int *) item->opt_var);
+      *ivalue = sc_keyvalue_get_int_check ((sc_keyvalue_t *) item->user_data,
+                                           optarg, &iserror);
+      if (iserror) {
+        SC_GEN_LOGF (package_id, SC_LC_GLOBAL, err_priority,
+                     "Error looking up: %s\n", optarg);
+        retval = -1;            /* this ends option processing */
+      }
+      else {
+        SC_FREE (item->string_value);
+        item->string_value = SC_STRDUP (optarg);
+      }
+      break;
     default:
       SC_ABORT_NOT_REACHED ();
     }
diff --git a/sc/src/sc_options.h b/sc/src/sc_options.h
index c107cde..d3f0677 100644
--- a/sc/src/sc_options.h
+++ b/sc/src/sc_options.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -23,52 +24,21 @@
 #ifndef SC_OPTIONS_H
 #define SC_OPTIONS_H
 
+/** \file sc_options.h
+ * Register and parse command line options and read/write configuration files.
+ */
+
 #include <sc_containers.h>
+#include <sc_keyvalue.h>
 
 SC_EXTERN_C_BEGIN;
 
-typedef enum
-{
-  SC_OPTION_SWITCH,
-  SC_OPTION_BOOL,
-  SC_OPTION_INT,
-  SC_OPTION_SIZE_T,
-  SC_OPTION_DOUBLE,
-  SC_OPTION_STRING,
-  SC_OPTION_INIFILE,
-  SC_OPTION_CALLBACK
-}
-sc_option_type_t;
-
-typedef struct
-{
-  sc_option_type_t    opt_type;
-  int                 opt_char;
-  const char         *opt_name;
-  void               *opt_var;
-  void                (*opt_fn) (void);
-  int                 has_arg;
-  int                 called;
-  const char         *help_string;
-  char               *string_value;
-  void               *user_data;
-}
-sc_option_item_t;
-
-typedef struct
-{
-  char                program_path[BUFSIZ];
-  const char         *program_name;
-  sc_array_t         *option_items;
-  int                 args_alloced;
-  int                 first_arg;
-  int                 argc;
-  char              **argv;
-  sc_array_t         *subopt_names;
-}
-sc_options_t;
+/** The options data structure is opaque. */
+typedef struct sc_options sc_options_t;
 
 /** This callback can be invoked during sc_options_parse.
+ * \param [in] opt      Valid options data structure.
+ *                      This is passed in case a file should be loaded.
  * \param [in] optarg   The option argument or NULL if there is none.
  * \param [in] data     User-defined data passed to sc_options_add_callback.
  * \return              Return 0 if successful, -1 on error.
@@ -76,23 +46,47 @@ sc_options_t;
 typedef int         (*sc_options_callback_t) (sc_options_t * opt,
                                               const char *optarg, void *data);
 
-/**
- * Create an empty options structure.
- * \param [in] program_path   Name or path name of the program.
+/** Create an empty options structure.
+ * \param [in] program_path   Name or path name of the program to display.
+ *                            Usually argv[0] is fine.
+ * \return                    A valid and empty options structure.
  */
 sc_options_t       *sc_options_new (const char *program_path);
 
-/**
- * Destroy the options structure.
+/** Destroy the options structure and all allocated structures contained.
+ * The keyvalue structure passed into sc_options_add_keyvalue is destroyed.
+ * \param [in,out] opt          This options structure is deallocated,
+ *                              including all key-value containers referenced.
+ * \deprecated                  This function may go away soon.
+ */
+void                sc_options_destroy_deep (sc_options_t * opt);
+
+/** Destroy the options structure.
+ * Whatever has been passed into sc_options_add_keyvalue is left alone.
+ * \param [in,out] opt          This options structure is deallocated.
  */
 void                sc_options_destroy (sc_options_t * opt);
 
-/**
- * Add a switch option. This option is used without option arguments.
+/** Set the spacing for \ref sc_options_print_summary.
+ * There are two values to be set: the spacing from the beginning of the
+ * printed line to the type of the option variable, and from the beginning
+ * of the printed line to the help string.
+ * \param [in,out] opt          Valid options structure.
+ * \param [in] space_type       Number of spaces to the type display, for
+ *                              example \<INT\>, \<STRING\>, etc.
+ *                              Setting this negative sets the default 20.
+ * \param [in] space_help       Number of spaces to the help string.
+ *                              Setting this negative sets the default 32.
+ */
+void                sc_options_set_spacing (sc_options_t * opt,
+                                            int space_type, int space_help);
+
+/** Add a switch option. This option is used without option arguments.
  * Every use increments the variable by one.  Its initial value is 0.
  * Either opt_char or opt_name must be valid, that is, not '\0'/NULL.
+ * \param [in,out] opt       A valid options structure.
  * \param [in] opt_char      Short option character, may be '\0'.
- * \param [in] opt_name      Option name without initial dashes, may be NULL.
+ * \param [in] opt_name      Long option name without initial dashes, may be NULL.
  * \param [in] variable      Address of the variable to store the option value.
  * \param [in] help_string   Help string for usage message, may be NULL.
  */
@@ -102,10 +96,17 @@ void                sc_options_add_switch (sc_options_t * opt,
                                            int *variable,
                                            const char *help_string);
 
-/**
- * Add a boolean option. It can be initialized to true or false in the C sense.
- * A use without argument sets it to true.  The argument 0/f/F/n/N sets it
- * to false (0).  The argument 1/t/T/y/Y sets it to true (not 0).
+/** Add a boolean option.
+ * It can be initialized to true or false in the C sense.
+ * Specifying it on the command line without argument sets the option to true.
+ * The argument 0/f/F/n/N sets it to false (0).
+ * The argument 1/t/T/y/Y sets it to true (nonzero).
+ * \param [in,out] opt       A valid options structure.
+ * \param [in] opt_char      Short option character, may be '\0'.
+ * \param [in] opt_name      Long option name without initial dashes, may be NULL.
+ * \param [in] variable      Address of the variable to store the option value.
+ * \param [in] init_value    Initial value to set the option, read as true or false.
+ * \param [in] help_string   Help string for usage message, may be NULL.
  */
 void                sc_options_add_bool (sc_options_t * opt,
                                          int opt_char,
@@ -113,9 +114,13 @@ void                sc_options_add_bool (sc_options_t * opt,
                                          int *variable, int init_value,
                                          const char *help_string);
 
-/**
- * Add an option that takes an integer argument.
- * \param [in] init_value   The initial value of the variable.
+/** Add an option that takes an integer argument.
+ * \param [in,out] opt       A valid options structure.
+ * \param [in] opt_char      Short option character, may be '\0'.
+ * \param [in] opt_name      Long option name without initial dashes, may be NULL.
+ * \param [in] variable      Address of the variable to store the option value.
+ * \param [in] init_value    The initial value of the option variable.
+ * \param [in] help_string   Help string for usage message, may be NULL.
  */
 void                sc_options_add_int (sc_options_t * opt,
                                         int opt_char,
@@ -123,10 +128,14 @@ void                sc_options_add_int (sc_options_t * opt,
                                         int *variable, int init_value,
                                         const char *help_string);
 
-/**
- * Add an option that takes a size_t argument.
- * The value of the size_t must not be greater than LLONG_MAX.
- * \param [in] init_value   The initial value of the variable.
+/** Add an option that takes a size_t argument.
+ * The value of the size_t variable must not be greater than LLONG_MAX.
+ * \param [in,out] opt       A valid options structure.
+ * \param [in] opt_char      Short option character, may be '\0'.
+ * \param [in] opt_name      Long option name without initial dashes, may be NULL.
+ * \param [in] variable      Address of the variable to store the option value.
+ * \param [in] init_value    The initial value of the option variable.
+ * \param [in] help_string   Help string for usage message, may be NULL.
  */
 void                sc_options_add_size_t (sc_options_t * opt,
                                            int opt_char,
@@ -135,9 +144,14 @@ void                sc_options_add_size_t (sc_options_t * opt,
                                            size_t init_value,
                                            const char *help_string);
 
-/**
- * Add an option that takes a double argument.
+/** Add an option that takes a double argument.
  * The double must be in the legal range.  "inf" and "nan" are legal too.
+ * \param [in,out] opt       A valid options structure.
+ * \param [in] opt_char      Short option character, may be '\0'.
+ * \param [in] opt_name      Long option name without initial dashes, may be NULL.
+ * \param [in] variable      Address of the variable to store the option value.
+ * \param [in] init_value    The initial value of the option variable.
+ * \param [in] help_string   Help string for usage message, may be NULL.
  */
 void                sc_options_add_double (sc_options_t * opt,
                                            int opt_char,
@@ -146,11 +160,14 @@ void                sc_options_add_double (sc_options_t * opt,
                                            double init_value,
                                            const char *help_string);
 
-/**
- * Add a string option.
- * \param [in] init_value  The default value of the option may be NULL.
- *                         If not NULL, the value is copied internally.
- * \param [out] variable   Will point to an internal string value.
+/** Add a string option.
+ * \param [in,out] opt       A valid options structure.
+ * \param [in] opt_char      Short option character, may be '\0'.
+ * \param [in] opt_name      Long option name without initial dashes, may be NULL.
+ * \param [in] variable      Address of the variable to store the option value.
+ * \param [in] init_value    This default value of the option may be NULL.
+ *                           If not NULL, the value is copied to internal storage.
+ * \param [in] help_string   Help string for usage message, may be NULL.
  */
 void                sc_options_add_string (sc_options_t * opt,
                                            int opt_char,
@@ -159,21 +176,36 @@ void                sc_options_add_string (sc_options_t * opt,
                                            const char *init_value,
                                            const char *help_string);
 
-/**
- * Add an option to read in a file in .ini format.
+/** Add an option to read in a file in .ini format.
+ * The argument to this option must be a filename.
+ * On parsing the specified file is read to set known option variables.
+ * It does not have an associated option variable itself.
+ * \param [in,out] opt       A valid options structure.
+ * \param [in] opt_char      Short option character, may be '\0'.
+ * \param [in] opt_name      Long option name without initial dashes, may be NULL.
+ * \param [in] help_string   Help string for usage message, may be NULL.
  */
 void                sc_options_add_inifile (sc_options_t * opt,
                                             int opt_char,
                                             const char *opt_name,
                                             const char *help_string);
 
-/**
- * Add an option that calls a user-defined function.
- * The callback function should be implemented to allow multiple calls
- * where the last call determines the effect independent of previous ones.
- * \param [in] has_arg  Specify if the option needs an option argument.
- * \param [in] fn       Function to call when this option is encountered.
- * \param [in] data     User-defined data passed to the callback.
+/** Add an option that calls a user-defined function when parsed.
+ * The callback function should be implemented to allow multiple calls.
+ * The option does not have an associated variable.
+ * The callback can be used to set multiple option variables in bulk that would
+ * otherwise require an inconvenient number of individual options.
+ * This is, however, currently not possible for options with
+ * string values or key-value pairs due to the way the API is set up.
+ * This function should not have non-option related side effects.
+ * This option is not loaded from or saved to files.
+ * \param [in,out] opt      A valid options structure.
+ * \param [in] opt_char     Short option character, may be '\0'.
+ * \param [in] opt_name     Long option name without initial dashes, may be NULL.
+ * \param [in] has_arg      Specify if the option needs an option argument.
+ * \param [in] fn           Function to call when this option is encountered.
+ * \param [in] data         User-defined data passed to the callback.
+ * \param [in] help_string  Help string for usage message, may be NULL.
  */
 void                sc_options_add_callback (sc_options_t * opt,
                                              int opt_char,
@@ -183,8 +215,31 @@ void                sc_options_add_callback (sc_options_t * opt,
                                              void *data,
                                              const char *help_string);
 
-/**
- * Copy one set of options to another as a subset, with a prefix.
+/** Add an option that takes string keys into a lookup table of integers.
+ * On calling this function, it must be certain that the initial value exists.
+ * \param [in] opt          Initialized options structure.
+ * \param [in] opt_char     Option character for command line, or 0.
+ * \param [in] opt_name     Name of the long option, or NULL.
+ * \param [in] variable     Address of an existing integer that holds
+ *                          the value of this option parameter.
+ * \param [in] init_value   The key that is looked up for the initial value.
+ *                          It must be certain that the key exists
+ *                          and its value is of type integer.
+ * \param [in] keyvalue     A valid key-value structure where the values
+ *                          must be integers.  If a key is asked for that
+ *                          does not exist, we will produce an option error.
+ *                          This structure must stay alive as long as opt.
+ * \param [in] help_string  Instructive one-line string to explain the option.
+ */
+void                sc_options_add_keyvalue (sc_options_t * opt,
+                                             int opt_char,
+                                             const char *opt_name,
+                                             int *variable,
+                                             const char *init_value,
+                                             sc_keyvalue_t * keyvalue,
+                                             const char *help_string);
+
+/** Copy one set of options to another as a subset, with a prefix.
  * \param [in,out] opt  A set of options.
  * \param [in]  subopt  Another set of options to be copied.
  * \param [in]  prefix  The prefix to add to option names as they are copied.
@@ -197,15 +252,14 @@ void                sc_options_add_suboptions (sc_options_t * opt,
                                                sc_options_t * subopt,
                                                const char *prefix);
 
-/**
- * Print a usage message.
+/** Print a usage message.
  * This function uses the SC_LC_GLOBAL log category.
  * That means the default action is to print only on rank 0.
  * Applications can change that by providing a user-defined log handler.
  * \param [in] package_id       Registered package id or -1.
  * \param [in] log_priority     Log priority for output according to sc.h.
  * \param [in] opt              The option structure.
- * \param [in] arg_usage        If not NULL, an <ARGUMENTS> string is appended
+ * \param [in] arg_usage        If not NULL, an \<ARGUMENTS\> string is appended
  *                              to the usage line.  If the string is non-empty,
  *                              it will be printed after the option summary
  *                              and an "ARGUMENTS:\n" title line.  Line breaks
@@ -215,8 +269,7 @@ void                sc_options_print_usage (int package_id, int log_priority,
                                             sc_options_t * opt,
                                             const char *arg_usage);
 
-/**
- * Print a summary of all option values.
+/** Print a summary of all option values.
  * Prints the title "Options:" and a line for every option,
  * then the title "Arguments:" and a line for every argument.
  * This function uses the SC_LC_GLOBAL log category.
@@ -230,9 +283,8 @@ void                sc_options_print_summary (int package_id,
                                               int log_priority,
                                               sc_options_t * opt);
 
-/**
- * Load a file in .ini format and updates entries found under [Options].  An
- * option whose name contains a colon such as "prefix:basename" will be
+/** Load a file in .ini format and update entries found under [Options].
+ * An option whose name contains a colon such as "prefix:basename" will be
  * updated by a "basename =" entry in a [prefix] section.
  * \param [in] package_id       Registered package id or -1.
  * \param [in] err_priority     Error log priority according to sc.h.
@@ -243,8 +295,7 @@ void                sc_options_print_summary (int package_id,
 int                 sc_options_load (int package_id, int err_priority,
                                      sc_options_t * opt, const char *inifile);
 
-/**
- * Save all options and arguments to a file in .ini format.
+/** Save all options and arguments to a file in .ini format.
  * This function must only be called after successful option parsing.
  * This function should only be called on rank 0.
  * This function will log errors with category SC_LC_GLOBAL.
@@ -253,14 +304,13 @@ int                 sc_options_load (int package_id, int err_priority,
  * \param [in] package_id       Registered package id or -1.
  * \param [in] err_priority     Error log priority according to sc.h.
  * \param [in] opt              The option structure.
- * \param [in] filename         Filename of the ini file to save.
+ * \param [in] inifile          Filename of the ini file to save.
  * \return                      Returns 0 on success, -1 on failure.
  */
 int                 sc_options_save (int package_id, int err_priority,
                                      sc_options_t * opt, const char *inifile);
 
-/**
- * Parse command line options.
+/** Parse command line options.
  * \param [in] package_id       Registered package id or -1.
  * \param [in] err_priority     Error log priority according to sc.h.
  * \param [in] opt              The option structure.
@@ -273,8 +323,7 @@ int                 sc_options_parse (int package_id, int err_priority,
                                       sc_options_t * opt, int argc,
                                       char **argv);
 
-/**
- * Load a file in .ini format and updates entries found under [Arguments].
+/** Load a file in .ini format and update entries found under [Arguments].
  * There needs to be a key Arguments.count specifing the number.
  * Then as many integer keys starting with 0 need to be present.
  * \param [in] package_id       Registered package id or -1.
diff --git a/sc/src/sc_sort.h b/sc/src/sc_private.h
similarity index 54%
copy from sc/src/sc_sort.h
copy to sc/src/sc_private.h
index dd89a85..de9bdcf 100644
--- a/sc/src/sc_sort.h
+++ b/sc/src/sc_private.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -20,26 +21,29 @@
   02110-1301, USA.
 */
 
-#ifndef SC_SORT_H
-#define SC_SORT_H
+/** \file sc_private.h
+ *
+ * Support for calls between different parts of the sc library.
+ * This is not meant for use from other packages or applications.
+ */
+
+#ifndef SC_PRIVATE_H
+#define SC_PRIVATE_H
 
 #include <sc.h>
 
 SC_EXTERN_C_BEGIN;
 
-/** Sort a distributed set of values in parallel.
- * This algorithm uses bitonic sort between processors and qsort locally.
- * The partition of the data can be arbitrary and is not changed.
- * \param [in] mpicomm          Communicator to use.
- * \param [in] base             Pointer to the local subset of data.
- * \param [in] nmemb            Array of mpisize counts of local data.
- * \param [in] size             Size in bytes of each data value.
- * \param [in] compar           Comparison function to use.
+/** Add to the per-package variable about active reference counters.
+ * This function is thread safe; it uses per-package locking internally.
+ * \param [in] package_id       The id of a registered package or -1.
+ * \param [in] toadd            This is added to the package's internal
+ *                              variable that counts active rcs.
+ *                              We assert that the new count does not
+ *                              drop below zero.
  */
-void                sc_psort (sc_MPI_Comm mpicomm, void *base,
-                              size_t * nmemb, size_t size,
-                              int (*compar) (const void *, const void *));
+void                sc_package_rc_count_add (int package_id, int toadd);
 
 SC_EXTERN_C_END;
 
-#endif /* SC_SORT_H */
+#endif /* SC_PRIVATE_H */
diff --git a/sc/src/sc_ranges.c b/sc/src/sc_ranges.c
index 916524a..b4dbcb8 100644
--- a/sc/src/sc_ranges.c
+++ b/sc/src/sc_ranges.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_ranges.h b/sc/src/sc_ranges.h
index 193cf1d..085a3cf 100644
--- a/sc/src/sc_ranges.h
+++ b/sc/src/sc_ranges.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_reduce.c b/sc/src/sc_reduce.c
index a4b1c23..24a1256 100644
--- a/sc/src/sc_reduce.c
+++ b/sc/src/sc_reduce.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_reduce.h b/sc/src/sc_reduce.h
index d1363f1..22b94c2 100644
--- a/sc/src/sc_reduce.h
+++ b/sc/src/sc_reduce.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_refcount.c b/sc/src/sc_refcount.c
new file mode 100644
index 0000000..6616d31
--- /dev/null
+++ b/sc/src/sc_refcount.c
@@ -0,0 +1,110 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+#include <sc_private.h>
+#include <sc_refcount.h>
+
+void
+sc_refcount_init_invalid (sc_refcount_t * rc)
+{
+  SC_ASSERT (rc != NULL);
+
+  rc->package_id = -1;
+  rc->refcount = -1;
+}
+
+void
+sc_refcount_init (sc_refcount_t * rc, int package_id)
+{
+  SC_ASSERT (rc != NULL);
+  SC_ASSERT (package_id == -1 || sc_package_is_registered (package_id));
+
+  rc->package_id = package_id;
+  rc->refcount = 1;
+
+#ifdef SC_ENABLE_DEBUG
+  sc_package_rc_count_add (rc->package_id, 1);
+#endif
+}
+
+sc_refcount_t      *
+sc_refcount_new (int package_id)
+{
+  sc_refcount_t      *rc;
+
+  rc = SC_ALLOC (sc_refcount_t, 1);
+  sc_refcount_init (rc, package_id);
+
+  return rc;
+}
+
+void
+sc_refcount_destroy (sc_refcount_t * rc)
+{
+  SC_ASSERT (rc != NULL);
+  SC_ASSERT (rc->refcount == 0);
+
+  SC_FREE (rc);
+}
+
+void
+sc_refcount_ref (sc_refcount_t * rc)
+{
+  SC_ASSERT (rc != NULL);
+  SC_ASSERT (rc->refcount > 0);
+
+  ++rc->refcount;
+}
+
+int
+sc_refcount_unref (sc_refcount_t * rc)
+{
+  SC_ASSERT (rc != NULL);
+  SC_ASSERT (rc->refcount > 0);
+
+  if (--rc->refcount == 0) {
+#ifdef SC_ENABLE_DEBUG
+    sc_package_rc_count_add (rc->package_id, -1);
+#endif
+    return 1;
+  }
+  else {
+    return 0;
+  }
+}
+
+int
+sc_refcount_is_active (const sc_refcount_t * rc)
+{
+  SC_ASSERT (rc != NULL);
+
+  return rc->refcount > 0;
+}
+
+int
+sc_refcount_is_last (const sc_refcount_t * rc)
+{
+  SC_ASSERT (rc != NULL);
+
+  return rc->refcount == 1;
+}
diff --git a/sc/src/sc_refcount.h b/sc/src/sc_refcount.h
new file mode 100644
index 0000000..a289b18
--- /dev/null
+++ b/sc/src/sc_refcount.h
@@ -0,0 +1,120 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+/** \file sc_refcount.h
+ *
+ * Provide reference counting facilities.
+ * The functions in this file can be used for multiple purposes.
+ * The current setup is not so much targeted at garbage collection but rather
+ * intended for debugging and verification.
+ */
+
+#ifndef SC_REFCOUNT_H
+#define SC_REFCOUNT_H
+
+#include <sc.h>
+
+SC_EXTERN_C_BEGIN;
+
+/** The refcount structure is declared in public so its size is known.
+ * Its members should really never be accessed directly.
+ */
+typedef struct sc_refcount
+{
+  /** The sc package that uses this reference counter. */
+  int                 package_id;
+
+  /** The reference count is always positive for a valid counter. */
+  int                 refcount;
+}
+sc_refcount_t;
+
+/** Initialize a well-defined but unusable reference counter.
+ * Specifically, we set its package identifier and reference count to -1.
+ * To make this reference counter usable, call \ref sc_refcount_init.
+ * \param [out] rc          This reference counter is defined as invalid.
+ *                          It will return false on both
+ *                          \ref sc_refcount_is_active and
+ *                          \ref sc_refcount_is_last.
+ *                          It can be made valid by calling
+ *                          \ref sc_refcount_init.
+ *                          No other functions must be called on it.
+ */
+void                sc_refcount_init_invalid (sc_refcount_t * rc);
+
+/** Initialize a reference counter to 1.
+ * It is legal if its status prior to this call is undefined.
+ * \param [out] rc          This reference counter is initialized to one.
+ *                          The object's contents may be undefined on input.
+ * \param [in] package_id   Either -1 or a package registered to libsc.
+ */
+void                sc_refcount_init (sc_refcount_t * rc, int package_id);
+
+/** Create a new reference counter with count initialized to 1.
+ * Equivalent to calling \ref sc_refcount_init on a newly allocated rc object.
+ * \param [in] package_id   Either -1 or a package registered to libsc.
+ * \return                  A reference counter with count one.
+ */
+sc_refcount_t      *sc_refcount_new (int package_id);
+
+/** Destroy a reference counter.
+ * It must have been counted down to zero before, thus reached an inactive state.
+ * \param [in,out] rc       This reference counter must have reached count zero.
+ */
+void                sc_refcount_destroy (sc_refcount_t * rc);
+
+/** Increase a reference counter.
+ * The counter must be active, that is, have a value greater than zero.
+ * \param [in,out] rc       This reference counter must be valid (greater zero).
+ *                          Its count is increased by one.
+ */
+void                sc_refcount_ref (sc_refcount_t * rc);
+
+/** Decrease the reference counter and notify when it reaches zero.
+ * The count must be greater zero on input.  If the reference count reaches
+ * zero, which is indicated by the return value, the counter may not be used
+ * further with \ref sc_refcount_ref or \see sc_refcount_unref.  It is legal,
+ * however, to reactivate it later by calling \see sc_refcount_init.
+ * \param [in,out] rc       This reference counter must be valid (greater zero).
+ *                          Its count is decreased by one.
+ * \return          True if the count has reached zero, false otherwise.
+ */
+int                 sc_refcount_unref (sc_refcount_t * rc);
+
+/** Check whether a reference counter has a positive value.
+ * This means that the reference counter is in use and corresponds to a live object.
+ * \param [in] rc   A reference counter.
+ * \return          True if the count is greater zero, false otherwise.
+ */
+int                 sc_refcount_is_active (const sc_refcount_t * rc);
+
+/** Check whether a reference counter has value one.
+ * This means that this counter is the last of its kind, which we may optimize for.
+ * \param [in] rc   A reference counter.
+ * \return          True if the count is exactly one.
+ */
+int                 sc_refcount_is_last (const sc_refcount_t * rc);
+
+SC_EXTERN_C_END;
+
+#endif /* !SC_REFCOUNT_H */
diff --git a/sc/src/sc_search.c b/sc/src/sc_search.c
index 2197286..d46e33b 100644
--- a/sc/src/sc_search.c
+++ b/sc/src/sc_search.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -84,7 +85,7 @@ sc_search_lower_bound64 (int64_t target, const int64_t * array,
     break;
   }
 
-  SC_ASSERT (0 <= guess && guess < nmemb);
+  SC_ASSERT (guess < nmemb);
   SC_ASSERT (array[guess] >= target);
   SC_ASSERT (guess == 0 || array[guess - 1] < target);
   return (ssize_t) guess;
diff --git a/sc/src/sc_search.h b/sc/src/sc_search.h
index b0a5834..451a889 100644
--- a/sc/src/sc_search.h
+++ b/sc/src/sc_search.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_shmem.c b/sc/src/sc_shmem.c
new file mode 100644
index 0000000..49bffa4
--- /dev/null
+++ b/sc/src/sc_shmem.c
@@ -0,0 +1,928 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+#include <sc_shmem.h>
+
+#if defined(__bgq__)
+/** for sc_allgather_final_*_bgq routines to work on BG/Q, you must
+ * run with --env BG_MAPCOMMONHEAP=1 */
+#include <hwi/include/bqc/A2_inlines.h>
+#endif
+
+#if defined(SC_ENABLE_MPI)
+static int          sc_shmem_keyval = MPI_KEYVAL_INVALID;
+#endif
+
+const char         *sc_shmem_type_to_string[SC_SHMEM_NUM_TYPES] = {
+  "basic", "basic_prescan",
+#if defined(SC_ENABLE_MPIWINSHARED)
+  "window", "window_prescan",
+#endif
+#if defined(__bgq__)
+  "bgq", "bgq_prescan",
+#endif
+};
+
+static void
+sc_scan_on_array (void *recvchar, int size, int count, int typesize,
+                  sc_MPI_Datatype type, sc_MPI_Op op)
+{
+  int                 p, c;
+
+  if (op == sc_MPI_SUM) {
+    if (type == sc_MPI_CHAR) {
+      char               *array = (char *) recvchar;
+
+      SC_ASSERT (sizeof (*array) == typesize);
+      for (p = 1; p <= size; p++) {
+        for (c = 0; c < count; c++) {
+          array[count * p + c] += array[count * (p - 1) + c];
+        }
+      }
+    }
+    else if (type == sc_MPI_SHORT) {
+      short              *array = (short *) recvchar;
+
+      SC_ASSERT (sizeof (*array) == typesize);
+      for (p = 1; p <= size; p++) {
+        for (c = 0; c < count; c++) {
+          array[count * p + c] += array[count * (p - 1) + c];
+        }
+      }
+    }
+    else if (type == sc_MPI_UNSIGNED_SHORT) {
+      unsigned short     *array = (unsigned short *) recvchar;
+
+      SC_ASSERT (sizeof (*array) == typesize);
+      for (p = 1; p <= size; p++) {
+        for (c = 0; c < count; c++) {
+          array[count * p + c] += array[count * (p - 1) + c];
+        }
+      }
+    }
+    else if (type == sc_MPI_INT) {
+      int                *array = (int *) recvchar;
+
+      SC_ASSERT (sizeof (*array) == typesize);
+      for (p = 1; p <= size; p++) {
+        for (c = 0; c < count; c++) {
+          array[count * p + c] += array[count * (p - 1) + c];
+        }
+      }
+    }
+    else if (type == sc_MPI_UNSIGNED) {
+      unsigned           *array = (unsigned *) recvchar;
+
+      SC_ASSERT (sizeof (*array) == typesize);
+      for (p = 1; p <= size; p++) {
+        for (c = 0; c < count; c++) {
+          array[count * p + c] += array[count * (p - 1) + c];
+        }
+      }
+    }
+    else if (type == sc_MPI_LONG) {
+      long               *array = (long *) recvchar;
+
+      SC_ASSERT (sizeof (*array) == typesize);
+      for (p = 1; p <= size; p++) {
+        for (c = 0; c < count; c++) {
+          array[count * p + c] += array[count * (p - 1) + c];
+        }
+      }
+    }
+    else if (type == sc_MPI_UNSIGNED_LONG) {
+      unsigned long      *array = (unsigned long *) recvchar;
+
+      SC_ASSERT (sizeof (*array) == typesize);
+      for (p = 1; p <= size; p++) {
+        for (c = 0; c < count; c++) {
+          array[count * p + c] += array[count * (p - 1) + c];
+        }
+      }
+    }
+    else if (type == sc_MPI_LONG_LONG_INT) {
+      long long          *array = (long long *) recvchar;
+
+      SC_ASSERT (sizeof (*array) == typesize);
+      for (p = 1; p <= size; p++) {
+        for (c = 0; c < count; c++) {
+          array[count * p + c] += array[count * (p - 1) + c];
+        }
+      }
+    }
+    else if (type == sc_MPI_FLOAT) {
+      float              *array = (float *) recvchar;
+
+      SC_ASSERT (sizeof (*array) == typesize);
+      for (p = 1; p <= size; p++) {
+        for (c = 0; c < count; c++) {
+          array[count * p + c] += array[count * (p - 1) + c];
+        }
+      }
+    }
+    else if (type == sc_MPI_DOUBLE) {
+      double             *array = (double *) recvchar;
+
+      SC_ASSERT (sizeof (*array) == typesize);
+      for (p = 1; p <= size; p++) {
+        for (c = 0; c < count; c++) {
+          array[count * p + c] += array[count * (p - 1) + c];
+        }
+      }
+    }
+    else if (type == sc_MPI_LONG_DOUBLE) {
+      long double        *array = (long double *) recvchar;
+
+      SC_ASSERT (sizeof (*array) == typesize);
+      for (p = 1; p <= size; p++) {
+        for (c = 0; c < count; c++) {
+          array[count * p + c] += array[count * (p - 1) + c];
+        }
+      }
+    }
+    else {
+      SC_ABORT ("MPI_Datatype not supported\n");
+    }
+  }
+  else {
+    SC_ABORT ("MPI_Op not supported\n");
+  }
+}
+
+#if !defined(SC_SHMEM_DEFAULT)
+#define SC_SHMEM_DEFAULT SC_SHMEM_BASIC
+#endif
+sc_shmem_type_t     sc_shmem_default_type = SC_SHMEM_DEFAULT;
+
+#ifdef SC_ENABLE_MPI
+
+static sc_shmem_type_t sc_shmem_types[SC_SHMEM_NUM_TYPES] = {
+  SC_SHMEM_BASIC,               /* entry order follows sc_shmem_type_t */
+  SC_SHMEM_PRESCAN,
+#if defined(SC_ENABLE_MPIWINSHARED)
+  SC_SHMEM_WINDOW,
+  SC_SHMEM_WINDOW_PRESCAN,      /* comma needed when BG/Q entries follow */
+#endif
+#if defined(__bgq__)
+  SC_SHMEM_BGQ,
+  SC_SHMEM_BGQ_PRESCAN,
+#endif
+};
+
+#endif /* SC_ENABLE_MPI */
+
+sc_shmem_type_t
+sc_shmem_get_type (sc_MPI_Comm comm)
+{
+#if defined(SC_ENABLE_MPI)
+  int                 mpiret, flg;
+  sc_shmem_type_t    *type;
+
+  if (sc_shmem_keyval == MPI_KEYVAL_INVALID) {
+    mpiret =
+      MPI_Comm_create_keyval (MPI_COMM_DUP_FN, MPI_COMM_NULL_DELETE_FN,
+                              &sc_shmem_keyval, NULL);
+    SC_CHECK_MPI (mpiret);
+  }
+  SC_ASSERT (sc_shmem_keyval != MPI_KEYVAL_INVALID);
+
+  mpiret = MPI_Comm_get_attr (comm, sc_shmem_keyval, &type, &flg);
+  SC_CHECK_MPI (mpiret);
+
+  if (flg) {
+    return *type;
+  }
+  else {
+    return SC_SHMEM_NOT_SET;
+  }
+#else
+  return SC_SHMEM_BASIC;
+#endif
+}
+
+void
+sc_shmem_set_type (sc_MPI_Comm comm, sc_shmem_type_t type)
+{
+#if defined(SC_ENABLE_MPI)
+  int                 mpiret;
+
+  if (sc_shmem_keyval == MPI_KEYVAL_INVALID) {
+    mpiret =
+      MPI_Comm_create_keyval (MPI_COMM_DUP_FN, MPI_COMM_NULL_DELETE_FN,
+                              &sc_shmem_keyval, NULL);
+    SC_CHECK_MPI (mpiret);
+  }
+  SC_ASSERT (sc_shmem_keyval != MPI_KEYVAL_INVALID);
+
+  mpiret = MPI_Comm_set_attr (comm, sc_shmem_keyval, &sc_shmem_types[type]);
+  SC_CHECK_MPI (mpiret);
+#endif
+}
+
+static              sc_shmem_type_t
+sc_shmem_get_type_default (sc_MPI_Comm comm)
+{
+  sc_shmem_type_t     type = sc_shmem_get_type (comm);
+  if (type == SC_SHMEM_NOT_SET) {
+    type = sc_shmem_default_type;
+    sc_shmem_set_type (comm, type);
+  }
+  return type;
+}
+
+/* BASIC implementation */
+static void        *
+sc_shmem_malloc_basic (int package, size_t elem_count, size_t elem_size,
+                       sc_MPI_Comm comm, sc_MPI_Comm intranode,
+                       sc_MPI_Comm internode)
+{
+  return sc_malloc (package, elem_count * elem_size);
+}
+
+static void
+sc_shmem_free_basic (int package, void *array, sc_MPI_Comm comm,
+                     sc_MPI_Comm intranode, sc_MPI_Comm internode)
+{
+  sc_free (package, array);
+}
+
+static int
+sc_shmem_write_start_basic (void *array, sc_MPI_Comm comm,
+                            sc_MPI_Comm intranode, sc_MPI_Comm internode)
+{
+  return 1;
+}
+
+static void
+sc_shmem_write_end_basic (void *array, sc_MPI_Comm comm,
+                          sc_MPI_Comm intranode, sc_MPI_Comm internode)
+{
+}
+
+static void
+sc_shmem_memcpy_basic (void *destarray, void *srcarray, size_t bytes,
+                       sc_MPI_Comm comm, sc_MPI_Comm intranode,
+                       sc_MPI_Comm internode)
+{
+  memcpy (destarray, srcarray, bytes);
+}
+
+static void
+sc_shmem_allgather_basic (void *sendbuf, int sendcount,
+                          sc_MPI_Datatype sendtype, void *recvbuf,
+                          int recvcount, sc_MPI_Datatype recvtype,
+                          sc_MPI_Comm comm, sc_MPI_Comm intranode,
+                          sc_MPI_Comm internode)
+{
+  int                 mpiret = sc_MPI_Allgather (sendbuf, sendcount, sendtype,
+                                                 recvbuf, recvcount, recvtype,
+                                                 comm);
+  SC_CHECK_MPI (mpiret);
+}
+
+static void
+sc_shmem_prefix_basic (void *sendbuf, void *recvbuf, int count,
+                       sc_MPI_Datatype type, sc_MPI_Op op,
+                       sc_MPI_Comm comm,
+                       sc_MPI_Comm intranode, sc_MPI_Comm internode)
+{
+  int                 mpiret, size;
+  size_t              typesize = sc_mpi_sizeof (type);
+
+  memset (recvbuf, 0, typesize * count);
+  mpiret = sc_MPI_Allgather (sendbuf, count, type, (char *) recvbuf +
+                             typesize * count, count, type, comm);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_size (comm, &size);
+  SC_CHECK_MPI (mpiret);
+  sc_scan_on_array (recvbuf, size, count, typesize, type, op);
+}
+
+/* PRESCAN implementation */
+
+static void
+sc_shmem_prefix_prescan (void *sendbuf, void *recvbuf, int count,
+                         sc_MPI_Datatype type, sc_MPI_Op op,
+                         sc_MPI_Comm comm, sc_MPI_Comm intranode,
+                         sc_MPI_Comm internode)
+{
+  int                 mpiret;
+  size_t              typesize = sc_mpi_sizeof (type);
+  char               *sendscan;
+
+  sendscan = SC_ALLOC (char, typesize * count);
+  mpiret = sc_MPI_Scan (sendbuf, sendscan, count, type, op, comm);
+  SC_CHECK_MPI (mpiret);
+
+  memset (recvbuf, 0, typesize * count);
+  mpiret = sc_MPI_Allgather (sendscan, count, type,
+                             (char *) recvbuf + typesize * count,
+                             count, type, comm);
+  SC_CHECK_MPI (mpiret);
+  SC_FREE (sendscan);
+}
+
+/* common to SHARED and WINDOW */
+
+#if defined(__bgq__) || defined(SC_ENABLE_MPIWINSHARED)
+
+static void
+sc_shmem_memcpy_common (void *destarray, void *srcarray, size_t bytes,
+                        sc_MPI_Comm comm, sc_MPI_Comm intranode,
+                        sc_MPI_Comm internode)
+{
+  if (sc_shmem_write_start (destarray, comm)) {
+    memcpy (destarray, srcarray, bytes);
+  }
+  sc_shmem_write_end (destarray, comm);
+}
+
+static void
+sc_shmem_allgather_common (void *sendbuf, int sendcount,
+                           sc_MPI_Datatype sendtype, void *recvbuf,
+                           int recvcount, sc_MPI_Datatype recvtype,
+                           sc_MPI_Comm comm, sc_MPI_Comm intranode,
+                           sc_MPI_Comm internode)
+{
+  size_t              typesize;
+  int                 mpiret, intrarank, intrasize;
+  char               *noderecvchar = NULL;
+
+  typesize = sc_mpi_sizeof (recvtype);
+
+  mpiret = sc_MPI_Comm_rank (intranode, &intrarank);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_size (intranode, &intrasize);
+  SC_CHECK_MPI (mpiret);
+
+  /* node root gathers from node */
+  if (!intrarank) {
+    noderecvchar = SC_ALLOC (char, intrasize * recvcount * typesize);
+  }
+  mpiret =
+    sc_MPI_Gather (sendbuf, sendcount, sendtype, noderecvchar, recvcount,
+                   recvtype, 0, intranode);
+  SC_CHECK_MPI (mpiret);
+
+  /* node root allgathers between nodes */
+  if (sc_shmem_write_start (recvbuf, comm)) {
+    mpiret =
+      sc_MPI_Allgather (noderecvchar, sendcount * intrasize, sendtype,
+                        recvbuf, recvcount * intrasize, recvtype, internode);
+    SC_CHECK_MPI (mpiret);
+    SC_FREE (noderecvchar);
+  }
+  sc_shmem_write_end (recvbuf, comm);
+}
+
+static void
+sc_shmem_prefix_common (void *sendbuf, void *recvbuf, int count,
+                        sc_MPI_Datatype type, sc_MPI_Op op,
+                        sc_MPI_Comm comm, sc_MPI_Comm intranode,
+                        sc_MPI_Comm internode)
+{
+  size_t              typesize;
+  int                 mpiret, intrarank, intrasize, size;
+  char               *noderecvchar = NULL;
+
+  typesize = sc_mpi_sizeof (type);
+
+  mpiret = sc_MPI_Comm_size (comm, &size);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_rank (intranode, &intrarank);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_size (intranode, &intrasize);
+  SC_CHECK_MPI (mpiret);
+
+  /* node root gathers from node */
+  if (!intrarank) {
+    noderecvchar = SC_ALLOC (char, intrasize * count * typesize);
+  }
+  mpiret =
+    sc_MPI_Gather (sendbuf, count, type, noderecvchar, count, type, 0,
+                   intranode);
+  SC_CHECK_MPI (mpiret);
+
+  /* node root allgathers between nodes */
+  if (sc_shmem_write_start (recvbuf, comm)) {
+    memset (recvbuf, 0, count * typesize);
+    mpiret =
+      sc_MPI_Allgather (noderecvchar, count * intrasize, type,
+                        (char *) recvbuf + count * typesize,
+                        count * intrasize, type, internode);
+    SC_CHECK_MPI (mpiret);
+    SC_FREE (noderecvchar);
+    sc_scan_on_array (recvbuf, size, count, typesize, type, op);
+  }
+  sc_shmem_write_end (recvbuf, comm);
+}
+
+/** Prescan prefix for shared arrays: scan sendbuf over comm and gather the
+ * results on intranode rank 0.  Where sc_shmem_write_start returns true, zero
+ * the first count entries of recvbuf and allgather behind them over internode. */
+static void
+sc_shmem_prefix_common_prescan (void *sendbuf, void *recvbuf, int count,
+                                sc_MPI_Datatype type, sc_MPI_Op op,
+                                sc_MPI_Comm comm, sc_MPI_Comm intranode,
+                                sc_MPI_Comm internode)
+{
+  size_t              typesize = sc_mpi_sizeof (type);
+  int                 mpiret, intrarank, intrasize;
+  char               *sendscan = NULL;
+  char               *noderecvchar = NULL;
+
+  sendscan = SC_ALLOC (char, typesize * count);
+  mpiret = sc_MPI_Scan (sendbuf, sendscan, count, type, op, comm);
+  SC_CHECK_MPI (mpiret);        /* do not let a failed scan go unnoticed */
+
+  mpiret = sc_MPI_Comm_rank (intranode, &intrarank);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_size (intranode, &intrasize);
+  SC_CHECK_MPI (mpiret);
+
+  /* node root gathers from node */
+  if (!intrarank) {
+    noderecvchar = SC_ALLOC (char, intrasize * count * typesize);
+  }
+  mpiret =
+    sc_MPI_Gather (sendscan, count, type, noderecvchar, count, type, 0,
+                   intranode);
+  SC_CHECK_MPI (mpiret);
+  SC_FREE (sendscan);
+
+  /* node root allgathers between nodes */
+  if (sc_shmem_write_start (recvbuf, comm)) {
+    memset (recvbuf, 0, count * typesize);
+    mpiret =
+      sc_MPI_Allgather (noderecvchar, count * intrasize, type,
+                        (char *) recvbuf + count * typesize,
+                        count * intrasize, type, internode);
+    SC_CHECK_MPI (mpiret);
+    SC_FREE (noderecvchar);
+  }
+  sc_shmem_write_end (recvbuf, comm);
+}
+
+#endif /* defined(__bgq__) || defined(SC_ENABLE_MPIWINSHARED) */
+
+#if defined(__bgq__)
+/* SHARED implementation */
+
+static int
+sc_shmem_write_start_bgq (void *array, sc_MPI_Comm comm,
+                          sc_MPI_Comm intranode, sc_MPI_Comm internode)
+{
+  int                 intrarank, mpiret;
+
+  mpiret = sc_MPI_Comm_rank (intranode, &intrarank);
+  SC_CHECK_MPI (mpiret);
+
+  return !intrarank;
+}
+
+static void
+sc_shmem_write_end_bgq (void *array, sc_MPI_Comm comm,
+                        sc_MPI_Comm intranode, sc_MPI_Comm internode)
+{
+  int                 mpiret;
+
+  /* these memory sync's are included in Jeff's example */
+  /* https://wiki.alcf.anl.gov/parts/index.php/Blue_Gene/Q#Abusing_the_common_heap */
+  ppc_msync ();
+  mpiret = sc_MPI_Barrier (intranode);
+  SC_CHECK_MPI (mpiret);
+}
+
+static void        *
+sc_shmem_malloc_bgq (int package, size_t elem_size, size_t elem_count,
+                     sc_MPI_Comm comm, sc_MPI_Comm intranode,
+                     sc_MPI_Comm internode)
+{
+  char               *array = NULL;
+  int                 mpiret;
+
+  if (sc_shmem_write_start_bgq (NULL, comm, intranode, internode)) {
+    array = sc_malloc (package, elem_size * elem_count);
+
+  }
+  /* these memory sync's are included in Jeff's example */
+  /* https://wiki.alcf.anl.gov/parts/index.php/Blue_Gene/Q#Abusing_the_common_heap */
+  ppc_msync ();
+
+  /* node root broadcast array start in node */
+  mpiret = sc_MPI_Bcast (&array, sizeof (char *), sc_MPI_BYTE, 0, intranode);
+  SC_CHECK_MPI (mpiret);
+
+  /* these memory sync's are included in Jeff's example */
+  /* https://wiki.alcf.anl.gov/parts/index.php/Blue_Gene/Q#Abusing_the_common_heap */
+  ppc_msync ();
+
+  return (void *) array;
+}
+
+static void
+sc_shmem_free_bgq (int package, void *array, sc_MPI_Comm comm,
+                   sc_MPI_Comm intranode, sc_MPI_Comm internode)
+{
+  if (sc_shmem_write_start_bgq (NULL, comm, intranode, internode)) {
+    sc_free (package, array);
+  }
+  sc_shmem_write_end_bgq (NULL, comm, intranode, internode);
+}
+#endif /* __bgq__ */
+
+#if defined(SC_ENABLE_MPIWINSHARED)
+/* MPI_Win implementation */
+
+static              MPI_Win
+sc_shmem_get_win (void *array, sc_MPI_Comm comm, sc_MPI_Comm intranode,
+                  sc_MPI_Comm internode)
+{
+  int                 mpiret, intrarank, intrasize;
+
+  mpiret = sc_MPI_Comm_rank (intranode, &intrarank);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_size (intranode, &intrasize);
+  SC_CHECK_MPI (mpiret);
+  return ((MPI_Win *) array)[-intrasize + intrarank];
+}
+
+static void        *
+sc_shmem_malloc_window (int package, size_t elem_size, size_t elem_count,
+                        sc_MPI_Comm comm, sc_MPI_Comm intranode,
+                        sc_MPI_Comm internode)
+{
+  char               *array = NULL;
+  int                 mpiret, disp_unit, intrarank, intrasize;
+  MPI_Win             win;
+  MPI_Aint            winsize = 0;
+
+  disp_unit = SC_MAX (elem_size, sizeof (MPI_Win));
+  mpiret = sc_MPI_Comm_rank (intranode, &intrarank);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_size (intranode, &intrasize);
+  SC_CHECK_MPI (mpiret);
+  if (!intrarank) {
+    winsize = elem_size * elem_count + intrasize * sizeof (MPI_Win);
+    if (winsize % disp_unit) {
+      winsize = ((winsize / disp_unit) + 1) * disp_unit;
+    }
+  }
+  mpiret = MPI_Win_allocate_shared (winsize, disp_unit, MPI_INFO_NULL,
+                                    intranode, &array, &win);
+  SC_CHECK_MPI (mpiret);
+  mpiret = MPI_Win_shared_query (win, 0, &winsize, &disp_unit, &array);
+  SC_CHECK_MPI (mpiret);
+  /* store the windows at the front of the array */
+  mpiret = sc_MPI_Gather (&win, sizeof (MPI_Win), sc_MPI_BYTE,
+                          array, sizeof (MPI_Win), sc_MPI_BYTE, 0, intranode);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Barrier (intranode);
+  SC_CHECK_MPI (mpiret);
+
+  mpiret = MPI_Win_lock (MPI_LOCK_SHARED, 0, MPI_MODE_NOCHECK, win);
+  SC_CHECK_MPI (mpiret);
+
+  return ((MPI_Win *) array) + intrasize;
+}
+
+static void
+sc_shmem_free_window (int package, void *array, sc_MPI_Comm comm,
+                      sc_MPI_Comm intranode, sc_MPI_Comm internode)
+{
+  int                 mpiret;
+  MPI_Win             win;
+
+  win = sc_shmem_get_win (array, comm, intranode, internode);
+
+  mpiret = MPI_Win_unlock (0, win);
+  SC_CHECK_MPI (mpiret);
+  mpiret = MPI_Win_free (&win);
+  SC_CHECK_MPI (mpiret);
+}
+
+static int
+sc_shmem_write_start_window (void *array, sc_MPI_Comm comm,
+                             sc_MPI_Comm intranode, sc_MPI_Comm internode)
+{
+  int                 mpiret, intrarank;
+  MPI_Win             win;
+
+  win = sc_shmem_get_win (array, comm, intranode, internode);
+
+  mpiret = MPI_Win_unlock (0, win);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_rank (intranode, &intrarank);
+  SC_CHECK_MPI (mpiret);
+  if (!intrarank) {
+    mpiret = MPI_Win_lock (MPI_LOCK_EXCLUSIVE, 0, MPI_MODE_NOCHECK, win);
+    SC_CHECK_MPI (mpiret);
+
+    return 1;
+  }
+  return 0;
+}
+
+static void
+sc_shmem_write_end_window (void *array, sc_MPI_Comm comm,
+                           sc_MPI_Comm intranode, sc_MPI_Comm internode)
+{
+  int                 mpiret, intrarank;
+  MPI_Win             win;
+
+  win = sc_shmem_get_win (array, comm, intranode, internode);
+
+  mpiret = sc_MPI_Comm_rank (intranode, &intrarank);
+  SC_CHECK_MPI (mpiret);
+  if (!intrarank) {
+    mpiret = MPI_Win_unlock (0, win);
+    SC_CHECK_MPI (mpiret);
+  }
+  mpiret = sc_MPI_Barrier (intranode);
+  SC_CHECK_MPI (mpiret);
+  mpiret = MPI_Win_lock (MPI_LOCK_SHARED, 0, MPI_MODE_NOCHECK, win);
+  SC_CHECK_MPI (mpiret);
+}
+#endif /* SC_ENABLE_MPIWINSHARED */
+
+void               *
+sc_shmem_malloc (int package, size_t elem_size, size_t elem_count,
+                 sc_MPI_Comm comm)
+{
+  sc_shmem_type_t     type;
+  sc_MPI_Comm         intranode = sc_MPI_COMM_NULL, internode =
+    sc_MPI_COMM_NULL;
+
+  type = sc_shmem_get_type_default (comm);
+  sc_mpi_comm_get_node_comms (comm, &intranode, &internode);
+  if (intranode == sc_MPI_COMM_NULL || internode == sc_MPI_COMM_NULL) {
+    type = SC_SHMEM_BASIC;
+  }
+  switch (type) {
+  case SC_SHMEM_BASIC:
+  case SC_SHMEM_PRESCAN:
+    return sc_shmem_malloc_basic (package, elem_size, elem_count, comm,
+                                  intranode, internode);
+#if defined(__bgq__)
+  case SC_SHMEM_BGQ:
+  case SC_SHMEM_BGQ_PRESCAN:
+    return sc_shmem_malloc_bgq (package, elem_size, elem_count, comm,
+                                intranode, internode);
+#endif
+#if defined(SC_ENABLE_MPIWINSHARED)
+  case SC_SHMEM_WINDOW:
+  case SC_SHMEM_WINDOW_PRESCAN:
+    return sc_shmem_malloc_window (package, elem_size, elem_count, comm,
+                                   intranode, internode);
+#endif
+  default:
+    SC_ABORT_NOT_REACHED ();
+  }
+  return NULL;
+}
+
+void
+sc_shmem_free (int package, void *array, sc_MPI_Comm comm)
+{
+  sc_shmem_type_t     type;
+  sc_MPI_Comm         intranode = sc_MPI_COMM_NULL, internode =
+    sc_MPI_COMM_NULL;
+
+  type = sc_shmem_get_type_default (comm);
+  sc_mpi_comm_get_node_comms (comm, &intranode, &internode);
+  if (intranode == sc_MPI_COMM_NULL || internode == sc_MPI_COMM_NULL) {
+    type = SC_SHMEM_BASIC;
+  }
+  switch (type) {
+  case SC_SHMEM_BASIC:
+  case SC_SHMEM_PRESCAN:
+    sc_shmem_free_basic (package, array, comm, intranode, internode);
+    break;
+#if defined(__bgq__)
+  case SC_SHMEM_BGQ:
+  case SC_SHMEM_BGQ_PRESCAN:
+    sc_shmem_free_bgq (package, array, comm, intranode, internode);
+    break;
+#endif
+#if defined(SC_ENABLE_MPIWINSHARED)
+  case SC_SHMEM_WINDOW:
+  case SC_SHMEM_WINDOW_PRESCAN:
+    sc_shmem_free_window (package, array, comm, intranode, internode);
+    break;
+#endif
+  default:
+    SC_ABORT_NOT_REACHED ();
+  }
+}
+
+int
+sc_shmem_write_start (void *array, sc_MPI_Comm comm)
+{
+  sc_shmem_type_t     type;
+  sc_MPI_Comm         intranode = sc_MPI_COMM_NULL, internode =
+    sc_MPI_COMM_NULL;
+
+  type = sc_shmem_get_type_default (comm);
+  sc_mpi_comm_get_node_comms (comm, &intranode, &internode);
+  if (intranode == sc_MPI_COMM_NULL || internode == sc_MPI_COMM_NULL) {
+    type = SC_SHMEM_BASIC;
+  }
+  switch (type) {
+  case SC_SHMEM_BASIC:
+  case SC_SHMEM_PRESCAN:
+    return sc_shmem_write_start_basic (array, comm, intranode, internode);
+#if defined(__bgq__)
+  case SC_SHMEM_BGQ:
+  case SC_SHMEM_BGQ_PRESCAN:
+    return sc_shmem_write_start_bgq (array, comm, intranode, internode);
+#endif
+#if defined(SC_ENABLE_MPIWINSHARED)
+  case SC_SHMEM_WINDOW:
+  case SC_SHMEM_WINDOW_PRESCAN:
+    return sc_shmem_write_start_window (array, comm, intranode, internode);
+#endif
+  default:
+    SC_ABORT_NOT_REACHED ();
+  }
+  return 0;
+}
+
+void
+sc_shmem_write_end (void *array, sc_MPI_Comm comm)
+{
+  sc_shmem_type_t     type;
+  sc_MPI_Comm         intranode = sc_MPI_COMM_NULL, internode =
+    sc_MPI_COMM_NULL;
+
+  type = sc_shmem_get_type_default (comm);
+  sc_mpi_comm_get_node_comms (comm, &intranode, &internode);
+  if (intranode == sc_MPI_COMM_NULL || internode == sc_MPI_COMM_NULL) {
+    type = SC_SHMEM_BASIC;
+  }
+  switch (type) {
+  case SC_SHMEM_BASIC:
+  case SC_SHMEM_PRESCAN:
+    sc_shmem_write_end_basic (array, comm, intranode, internode);
+    break;
+#if defined(__bgq__)
+  case SC_SHMEM_BGQ:
+  case SC_SHMEM_BGQ_PRESCAN:
+    sc_shmem_write_end_bgq (array, comm, intranode, internode);
+    break;
+#endif
+#if defined(SC_ENABLE_MPIWINSHARED)
+  case SC_SHMEM_WINDOW:
+  case SC_SHMEM_WINDOW_PRESCAN:
+    sc_shmem_write_end_window (array, comm, intranode, internode);
+    break;
+#endif
+  default:
+    SC_ABORT_NOT_REACHED ();
+  }
+}
+
+void
+sc_shmem_memcpy (void *destarray, void *srcarray, size_t bytes,
+                 sc_MPI_Comm comm)
+{
+  sc_shmem_type_t     type;
+  sc_MPI_Comm         intranode = sc_MPI_COMM_NULL, internode =
+    sc_MPI_COMM_NULL;
+
+  type = sc_shmem_get_type_default (comm);
+  sc_mpi_comm_get_node_comms (comm, &intranode, &internode);
+  if (intranode == sc_MPI_COMM_NULL || internode == sc_MPI_COMM_NULL) {
+    type = SC_SHMEM_BASIC;
+  }
+  switch (type) {
+  case SC_SHMEM_BASIC:
+  case SC_SHMEM_PRESCAN:
+    sc_shmem_memcpy_basic (destarray, srcarray, bytes, comm, intranode,
+                           internode);
+    break;
+#if defined(__bgq__) || defined(SC_ENABLE_MPIWINSHARED)
+#if defined(__bgq__)
+  case SC_SHMEM_BGQ:
+  case SC_SHMEM_BGQ_PRESCAN:
+#endif
+#if defined(SC_ENABLE_MPIWINSHARED)
+  case SC_SHMEM_WINDOW:
+  case SC_SHMEM_WINDOW_PRESCAN:
+#endif
+    sc_shmem_memcpy_common (destarray, srcarray, bytes, comm, intranode,
+                            internode);
+    break;
+#endif
+  default:
+    SC_ABORT_NOT_REACHED ();
+  }
+}
+
+void
+sc_shmem_allgather (void *sendbuf, int sendcount,
+                    sc_MPI_Datatype sendtype, void *recvbuf,
+                    int recvcount, sc_MPI_Datatype recvtype, sc_MPI_Comm comm)
+{
+  sc_shmem_type_t     type;
+  sc_MPI_Comm         intranode = sc_MPI_COMM_NULL, internode =
+    sc_MPI_COMM_NULL;
+
+  type = sc_shmem_get_type_default (comm);
+  sc_mpi_comm_get_node_comms (comm, &intranode, &internode);
+  if (intranode == sc_MPI_COMM_NULL || internode == sc_MPI_COMM_NULL) {
+    type = SC_SHMEM_BASIC;
+  }
+  switch (type) {
+  case SC_SHMEM_BASIC:
+  case SC_SHMEM_PRESCAN:
+    sc_shmem_allgather_basic (sendbuf, sendcount, sendtype, recvbuf,
+                              recvcount, recvtype, comm, intranode,
+                              internode);
+    break;
+#if defined(__bgq__) || defined(SC_ENABLE_MPIWINSHARED)
+#if defined(__bgq__)
+  case SC_SHMEM_BGQ:
+  case SC_SHMEM_BGQ_PRESCAN:
+#endif
+#if defined(SC_ENABLE_MPIWINSHARED)
+  case SC_SHMEM_WINDOW:
+  case SC_SHMEM_WINDOW_PRESCAN:
+#endif
+    sc_shmem_allgather_common (sendbuf, sendcount, sendtype, recvbuf,
+                               recvcount, recvtype, comm, intranode,
+                               internode);
+    break;
+#endif
+  default:
+    SC_ABORT_NOT_REACHED ();
+  }
+}
+
+void
+sc_shmem_prefix (void *sendbuf, void *recvbuf, int count,
+                 sc_MPI_Datatype dtype, sc_MPI_Op op, sc_MPI_Comm comm)
+{
+  sc_shmem_type_t     type;
+  sc_MPI_Comm         intranode = sc_MPI_COMM_NULL, internode =
+    sc_MPI_COMM_NULL;
+
+  type = sc_shmem_get_type_default (comm);
+  sc_mpi_comm_get_node_comms (comm, &intranode, &internode);
+  if (intranode == sc_MPI_COMM_NULL || internode == sc_MPI_COMM_NULL) {
+    type = SC_SHMEM_BASIC;
+  }
+  switch (type) {
+  case SC_SHMEM_BASIC:
+    sc_shmem_prefix_basic (sendbuf, recvbuf, count, dtype, op, comm,
+                           intranode, internode);
+    break;
+  case SC_SHMEM_PRESCAN:
+    sc_shmem_prefix_prescan (sendbuf, recvbuf, count, dtype, op, comm,
+                             intranode, internode);
+    break;
+#if defined(__bgq__) || defined(SC_ENABLE_MPIWINSHARED)
+#if defined(__bgq__)
+  case SC_SHMEM_BGQ:
+#endif
+#if defined(SC_ENABLE_MPIWINSHARED)
+  case SC_SHMEM_WINDOW:
+#endif
+    sc_shmem_prefix_common (sendbuf, recvbuf, count, dtype, op, comm,
+                            intranode, internode);
+    break;
+#endif
+#if defined(__bgq__) || defined(SC_ENABLE_MPIWINSHARED)
+#if defined(__bgq__)
+  case SC_SHMEM_BGQ_PRESCAN:
+#endif
+#if defined(SC_ENABLE_MPIWINSHARED)
+  case SC_SHMEM_WINDOW_PRESCAN:
+#endif
+    sc_shmem_prefix_common_prescan (sendbuf, recvbuf, count, dtype, op,
+                                    comm, intranode, internode);
+    break;
+#endif
+  default:
+    SC_ABORT_NOT_REACHED ();
+  }
+}
diff --git a/sc/src/sc_shmem.h b/sc/src/sc_shmem.h
new file mode 100644
index 0000000..fbc578a
--- /dev/null
+++ b/sc/src/sc_shmem.h
@@ -0,0 +1,169 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+#ifndef SC_SHMEM_H
+#define SC_SHMEM_H
+
+#include <sc.h>
+#include <sc_mpi.h>
+
+SC_EXTERN_C_BEGIN;
+
+/** \file sc_shmem.h */
+
+/* sc_shmem: an interface for arrays that are redundant on each mpi
+ * process */
+
+typedef enum
+{
+  SC_SHMEM_BASIC = 0,      /**< use allgathers, then sum to simulate scan */
+  SC_SHMEM_PRESCAN,        /**< mpi_scan, then allgather */
+#if defined(SC_ENABLE_MPIWINSHARED)
+  SC_SHMEM_WINDOW,         /**< MPI_Win (requires MPI 3) */
+  SC_SHMEM_WINDOW_PRESCAN, /**< mpi_scan, then MPI_Win (requires MPI 3) */
+#endif
+#if defined(__bgq__)
+  SC_SHMEM_BGQ,            /**< raw pointer passing: only works for
+                                shared-heap environments */
+  SC_SHMEM_BGQ_PRESCAN,    /**< mpi_scan, then raw pointer passing: only works
+                                for shared-heap environments */
+#endif
+  SC_SHMEM_NUM_TYPES,      /**< count of the types above; not a type itself */
+  SC_SHMEM_NOT_SET         /**< presumably: no type chosen yet (confirm) */
+}
+sc_shmem_type_t;
+
+extern const char  *sc_shmem_type_to_string[SC_SHMEM_NUM_TYPES];
+
+extern sc_shmem_type_t sc_shmem_default_type;
+
+/* ALL sc_shmem routines should be considered collective: called on
+ * every process in the communicator */
+
+/** Set the type of shared memory arrays to use on this mpi communicator.
+ *
+ * \param[in,out] comm        the mpi communicator
+ * \param[in]     type        the type of shmem array behavior.
+ */
+void                sc_shmem_set_type (sc_MPI_Comm comm,
+                                       sc_shmem_type_t type);
+
+/** Get the type of shared memory arrays to use on this mpi communicator.
+ *
+ * \param[in] comm        the mpi communicator
+ *
+ * \return the type of shmem array used on this communicator.
+ */
+sc_shmem_type_t     sc_shmem_get_type (sc_MPI_Comm comm);
+
+/** Allocate a shmem array: an array that is redundant on every process.
+ *
+ * \param[in] package         package requesting memory
+ * \param[in] elem_size       the size of each element in the array
+ * \param[in] elem_count      the number of elements in the array
+ * \param[in] comm            the mpi communicator
+ *
+ * \return a shared memory array
+ * */
+void               *sc_shmem_malloc (int package, size_t elem_size,
+                                     size_t elem_count, sc_MPI_Comm comm);
+
+#define SC_SHMEM_ALLOC(t,e,c) (t *) sc_shmem_malloc(sc_package_id,sizeof(t),e,c)
+
+/** Destroy a shmem array created with sc_shmem_malloc()
+ *
+ * \param[in] package         package freeing memory
+ * \param[in] array           array to be freed
+ * \param[in] comm            the mpi communicator
+ *
+ * */
+void                sc_shmem_free (int package, void *array,
+                                   sc_MPI_Comm comm);
+
+#define SC_SHMEM_FREE(a,c) sc_shmem_free (sc_package_id,a,c)
+
+/** Start a write window for a shared array.
+ *
+ * \param[in] array           array that will be changed.
+ * \param[in] comm            the mpi communicator
+ *
+ * \return 1 if I have write access, 0 if my proc should not change the
+ * array.
+ */
+int                 sc_shmem_write_start (void *array, sc_MPI_Comm comm);
+
+/** End a write window for a shared array.
+ *
+ * \param[in] array           array that has changed
+ * \param[in] comm            the mpi communicator
+ */
+void                sc_shmem_write_end (void *array, sc_MPI_Comm comm);
+
+/** Copy a shmem array.
+ *
+ * \param[out]  destarray     array to write to
+ * \param[in]   srcarray      array to write from
+ * \param[in]   bytes         number of bytes to write
+ * \param[in]   comm          the mpi communicator
+ */
+void                sc_shmem_memcpy (void *destarray, void *srcarray,
+                                     size_t bytes, sc_MPI_Comm comm);
+
+/** Fill a shmem array with an allgather.
+ *
+ * \param[in] sendbuf         the source from this process
+ * \param[in] sendcount       the number of items to allgather
+ * \param[in] sendtype        the type of items to allgather
+ * \param[in,out] recvbuf     the destination shmem array
+ * \param[in] recvcount       the number of items to allgather
+ * \param[in] recvtype        the type of items to allgather
+ * \param[in] comm            the mpi communicator
+ */
+void                sc_shmem_allgather (void *sendbuf, int sendcount,
+                                        sc_MPI_Datatype sendtype,
+                                        void *recvbuf, int recvcount,
+                                        sc_MPI_Datatype recvtype,
+                                        sc_MPI_Comm comm);
+
+/** Fill a shmem array with an allgather of the prefix op over all processes.
+ *
+ * The return array will be
+ * (0, send0, send0 op send1, send0 op send1 op send2, ...)
+ *
+ * Note that the first entry of \a recvbuf will be set to 0 using memset: if
+ * this is not the desired value for the first entry of the array, the user
+ * can change it *after* calling sc_shmem_prefix.
+ *
+ * \param[in] sendbuf         the source from this process
+ * \param[in,out] recvbuf     the destination shmem array
+ * \param[in] count           the number of items to allgather
+ * \param[in] type            the type of items to allgather
+ * \param[in] op              the operation to prefix (e.g., sc_MPI_SUM)
+ * \param[in] comm            the mpi communicator
+ */
+void                sc_shmem_prefix (void *sendbuf, void *recvbuf,
+                                     int count, sc_MPI_Datatype type,
+                                     sc_MPI_Op op, sc_MPI_Comm comm);
+SC_EXTERN_C_END;
+
+#endif /* SC_SHMEM_H */
diff --git a/sc/src/sc_sort.c b/sc/src/sc_sort.c
index 1f13d32..2d3221f 100644
--- a/sc/src/sc_sort.c
+++ b/sc/src/sc_sort.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_sort.h b/sc/src/sc_sort.h
index dd89a85..f29bbca 100644
--- a/sc/src/sc_sort.h
+++ b/sc/src/sc_sort.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_statistics.c b/sc/src/sc_statistics.c
index 1786db7..39b8bf2 100644
--- a/sc/src/sc_statistics.c
+++ b/sc/src/sc_statistics.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -75,6 +76,7 @@ sc_stats_set1 (sc_statinfo_t * stats, double value, const char *variable)
   stats->sum_squares = value * value;
   stats->min = value;
   stats->max = value;
+  stats->average = 0.;
   stats->variable = variable;
 }
 
@@ -85,6 +87,7 @@ sc_stats_init (sc_statinfo_t * stats, const char *variable)
   stats->count = 0;
   stats->sum_values = stats->sum_squares = 0.;
   stats->min = stats->max = 0.;
+  stats->average = 0.;
   stats->variable = variable;
 }
 
@@ -381,6 +384,12 @@ sc_statistics_add_empty (sc_statistics_t * stats, const char *name)
   sc_keyvalue_set_int (stats->kv, name, i);
 }
 
+int
+sc_statistics_has (sc_statistics_t * stats, const char *name)
+{
+  return sc_keyvalue_exists (stats->kv, name);  /* look name up in stats->kv */
+}
+
 void
 sc_statistics_accumulate (sc_statistics_t * stats, const char *name,
                           double value)
diff --git a/sc/src/sc_statistics.h b/sc/src/sc_statistics.h
index 8b76e41..a6b2a41 100644
--- a/sc/src/sc_statistics.h
+++ b/sc/src/sc_statistics.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -133,6 +134,9 @@ void                sc_statistics_add (sc_statistics_t * stats,
 void                sc_statistics_add_empty (sc_statistics_t * stats,
                                              const char *name);
 
+/** Returns true if the stats include a variable with the given name */
+int                 sc_statistics_has (sc_statistics_t * stats,
+                                       const char *name);
 /** Set the value of a statistics variable, see sc_stats_set1.
  * The variable must previously be added with sc_statistics_add.
  * This assumes count=1 as in the sc_stats_set1 function above.
diff --git a/sc/src/sc_string.c b/sc/src/sc_string.c
new file mode 100644
index 0000000..c2cdcca
--- /dev/null
+++ b/sc/src/sc_string.c
@@ -0,0 +1,102 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+#include <sc_string.h>
+
+void
+sc_string_init (sc_string_t * scs)
+{
+  SC_ASSERT (scs != NULL);
+  /* the empty string: no characters printed, buffer terminated at index 0 */
+  scs->printed = 0;
+  scs->buffer[0] = '\0';
+}
+
+int
+sc_string_putc (sc_string_t * scs, int c)
+{
+  return sc_string_putf (scs, "%c", c); /* append c via the "%c" format */
+}
+
+int
+sc_string_puts (sc_string_t * scs, const char *s)
+{
+  return sc_string_putf (scs, "%s", s); /* append s via the "%s" format */
+}
+
+int
+sc_string_putf (sc_string_t * scs, const char *fmt, ...)
+{
+  int                 result;
+  va_list             val;
+
+  SC_ASSERT (scs != NULL);
+  SC_ASSERT (0 <= scs->printed && scs->printed < SC_STRING_SIZE);
+  /* wrap the variable arguments and let sc_string_putv do the appending */
+  va_start (val, fmt);
+  result = sc_string_putv (scs, fmt, val);
+  va_end (val);
+
+  return result;                /* passed through from sc_string_putv */
+}
+
+int
+sc_string_putv (sc_string_t * scs, const char *fmt, va_list ap)
+{
+  int                 remain, result;
+
+  SC_ASSERT (scs != NULL);
+  SC_ASSERT (0 <= scs->printed && scs->printed < SC_STRING_SIZE);
+  /* remain counts the unused bytes including the one for the final '\0' */
+  remain = SC_STRING_SIZE - scs->printed;
+  if (remain == 1) {
+    /* the string is full and we cannot append any more */
+    return -1;
+  }
+
+  /* we print and see how many characters fit */
+  result = vsnprintf (scs->buffer + scs->printed, remain, fmt, ap);
+  if (result < 0 || result >= remain) {
+    /* output error or truncation: mark the string as full from now on */
+    scs->printed = SC_STRING_SIZE - 1;
+    return -1;
+  }
+  else {
+    /* everything we printed has been fitted into the string */
+    scs->printed += result;
+    SC_ASSERT (0 <= scs->printed && scs->printed < SC_STRING_SIZE);
+    return 0;
+  }
+}
+
+const char         *
+sc_string_get_content (sc_string_t * scs, int *length)
+{
+  SC_ASSERT (scs != NULL);
+  SC_ASSERT (0 <= scs->printed && scs->printed < SC_STRING_SIZE);
+  /* length is optional; it receives the number of characters printed */
+  if (length != NULL) {
+    *length = scs->printed;
+  }
+  return scs->buffer;           /* points into scs, not a copy */
+}
diff --git a/sc/src/sc_string.h b/sc/src/sc_string.h
new file mode 100644
index 0000000..262993d
--- /dev/null
+++ b/sc/src/sc_string.h
@@ -0,0 +1,107 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+#ifndef SC_STRING_H
+#define SC_STRING_H
+
+#include <sc.h>
+
+/** \file sc_string.h
+ * This file declares a simple string object that can be appended to.
+ */
+
+/** This defines the maximum string storage including the trailing '\0'. */
+#define SC_STRING_SIZE 4088
+
+/** This is a simple opaque type for growing a string by printf-like commands.
+ * It can be declared on the stack to avoid malloc and free.
+ * This means that the length of the string is limited to \ref SC_STRING_SIZE - 1.
+ * The current string can be accessed by \ref sc_string_get_content.
+ * This is really an opaque object: its members shall not be accessed directly.
+ */
+typedef struct sc_string
+{
+  /* None of the member variables are public.
+   * We provide the declaration here in sc_string.h
+   * so the object can be declared on the stack.
+   */
+  int                 printed;                  /**< Opaque object: do not access. */
+  char                buffer[SC_STRING_SIZE];   /**< Opaque object: do not access. */
+}
+sc_string_t;
+
+/** Initialize to an empty string.
+ * This function can be used to reset a non-empty string to be empty again.
+ * \param [out] scs             After returning, a valid object
+ *                              containing the empty string.
+ */
+void                sc_string_init (sc_string_t * scs);
+
+/** Append a single character to the string buffer object.
+ * \param [in,out] scs          A valid string buffer object.
+ * \param [in] c                Converted to an unsigned char and appended.
+ * \return                      Zero if the character has been appended and
+ *                              a negative value when the input was truncated.
+ */
+int                 sc_string_putc (sc_string_t * scs, int c);
+
+/** Append a string to the string buffer object.
+ * \param [in,out] scs          A valid string buffer object.
+ * \param [in] s                This string is appended to the string buffer.
+ * \return                      Zero if the string has been appended and
+ *                              a negative value when the input was truncated.
+ */
+int                 sc_string_puts (sc_string_t * scs, const char *s);
+
+/** Append to the string object using a format string and arguments.
+ * The maximum length will not be exceeded.
+ * The string object will remain valid even on truncated input.
+ * \param [in,out] scs          Valid string object that is appended to.
+ * \param [in] fmt              Format string as used with printf and friends.
+ * \return                      Zero if everything has been appended and a
+ *                              negative value when the input was truncated.
+ */
+int                 sc_string_putf (sc_string_t * scs, const char *fmt, ...)
+  __attribute__ ((format (printf, 2, 3)));
+
+/** Append to the string object using a format string and a vararg pointer.
+ * The maximum length will not be exceeded.
+ * The string object will remain valid even on truncated input.
+ * \param [in,out] scs          Valid string object that is appended to.
+ * \param [in] fmt              Format string as used with printf and friends.
+ * \param [in,out] ap           Argument list pointer as defined in stdarg.h.
+ * \return                      Zero if everything has been appended and a
+ *                              negative value when the input was truncated.
+ */
+int                 sc_string_putv (sc_string_t * scs, const char *fmt,
+                                    va_list ap);
+
+/** Access content of the string buffer.
+ * \param [in] scs              Valid sc_string object.
+ * \param [out] length          If not NULL, assign length without trailing '\0'.
+ * \return                      Pointer to an internally allocated string, may
+ *                              not be used after \b scs goes out of scope.
+ */
+const char         *sc_string_get_content (sc_string_t * scs, int *length);
+
+#endif /* !SC_STRING_H */
diff --git a/sc/src/sc_unique_counter.c b/sc/src/sc_unique_counter.c
new file mode 100644
index 0000000..c6bf4fc
--- /dev/null
+++ b/sc/src/sc_unique_counter.c
@@ -0,0 +1,77 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+#include <sc_unique_counter.h>
+
+sc_unique_counter_t *
+sc_unique_counter_new (int start_value)
+{
+  sc_unique_counter_t *uc;
+
+  uc = SC_ALLOC (sc_unique_counter_t, 1);
+  uc->start_value = start_value;
+  uc->mempool = sc_mempool_new_zero_and_persist (sizeof (int));
+  /* NOTE(review): add/release appear to rely on zeroed, persistent elements */
+  return uc;
+}
+
+void
+sc_unique_counter_destroy (sc_unique_counter_t * uc)
+{
+  SC_ASSERT (uc->mempool->elem_count == 0);
+  /* the assertion above expects every counter to be released by now */
+  sc_mempool_destroy (uc->mempool);
+  SC_FREE (uc);
+}
+
+size_t
+sc_unique_counter_memory_used (sc_unique_counter_t * uc)
+{                               /* size of the factory struct plus its mempool */
+  return sizeof (sc_unique_counter_t) + sc_mempool_memory_used (uc->mempool);
+}
+
+int                *
+sc_unique_counter_add (sc_unique_counter_t * uc)
+{
+  int                *counter;
+  /* a zero element gets the current element count as its stored number */
+  counter = (int *) sc_mempool_alloc (uc->mempool);
+  if (!*counter) {
+    *counter = (int) uc->mempool->elem_count;
+  }
+  *counter += uc->start_value - 1;      /* shift so values begin at start_value */
+  SC_ASSERT (*counter >= uc->start_value);
+
+  return counter;
+}
+
+void
+sc_unique_counter_release (sc_unique_counter_t * uc, int *counter)
+{
+  SC_ASSERT (counter != NULL);
+  SC_ASSERT (*counter >= uc->start_value);
+  /* undo the start_value shift applied in sc_unique_counter_add */
+  *counter -= uc->start_value - 1;
+
+  sc_mempool_free (uc->mempool, counter);
+}
diff --git a/sc/src/sc_unique_counter.h b/sc/src/sc_unique_counter.h
new file mode 100644
index 0000000..8024da0
--- /dev/null
+++ b/sc/src/sc_unique_counter.h
@@ -0,0 +1,76 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+#ifndef SC_UNIQUE_COUNTER_H
+#define SC_UNIQUE_COUNTER_H
+
+#include <sc_containers.h>
+
+typedef struct sc_unique_counter
+{
+  int                 start_value;      /**< value of the first counter created */
+  sc_mempool_t       *mempool;          /**< holds one int per counter */
+}
+sc_unique_counter_t;
+
+/** Create a factory for unique tag numbers.
+ * The first tag number created will be start_value.
+ * Subsequent ones are counted forward from there.
+ * If a counter is released, it will be reactivated by the next creation.
+ * \param [in] start_value      Value of the first counter to be added.
+ * \return                      Fully initialized counter factory.
+ */
+sc_unique_counter_t *sc_unique_counter_new (int start_value);
+
+/** Destroy the counter factory and all counters created from it.
+ * All counters added must have been released before calling this function.
+ * \param [in,out] uc           This memory will be released.
+ */
+void                sc_unique_counter_destroy (sc_unique_counter_t * uc);
+
+/** Return the size in bytes allocated by this counter factory.
+ * \param [in,out] uc           Its total memory used will be counted.
+ */
+size_t              sc_unique_counter_memory_used (sc_unique_counter_t * uc);
+
+/** Request and return a counter with a unique integer value.
+ * The memory returned is borrowed and still being owned by \a uc.  The same
+ * number will never be returned twice, unless the counter has been released
+ * first.
+ * \param [in,out] uc           The factory to return a unique counter.
+ * \return                      Pointer to internal memory.
+ *                              The unique counter value is accessed by simply
+ *                              dereferencing the int pointer.
+ */
+int                *sc_unique_counter_add (sc_unique_counter_t * uc);
+
+/** Release and return a counter to the factory.
+ * It will be reactivated on a subsequent call to sc_unique_counter_add.
+ * \param [in,out] uc           The factory to return a unique counter.
+ * \param [in] counter          This must be a pointer previously obtained from
+ *                              sc_unique_counter_add and not since released.
+ */
+void                sc_unique_counter_release (sc_unique_counter_t * uc,
+                                               int *counter);
+
+#endif /* !SC_UNIQUE_COUNTER_H */
diff --git a/sc/src/sc_warp.c b/sc/src/sc_warp.c
index 70315bd..3f3b119 100644
--- a/sc/src/sc_warp.c
+++ b/sc/src/sc_warp.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/src/sc_warp.h b/sc/src/sc_warp.h
index d962430..d40cd86 100644
--- a/sc/src/sc_warp.h
+++ b/sc/src/sc_warp.h
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/test/Makefile.am b/sc/test/Makefile.am
index 3c01716..e644e72 100644
--- a/sc/test/Makefile.am
+++ b/sc/test/Makefile.am
@@ -7,10 +7,12 @@ sc_test_programs = \
         test/sc_test_allgather \
         test/sc_test_arrays \
         test/sc_test_builtin \
+        test/sc_test_darray_work \
         test/sc_test_dmatrix \
         test/sc_test_dmatrix_pool \
         test/sc_test_io_sink \
         test/sc_test_keyvalue \
+        test/sc_test_node_comm \
         test/sc_test_notify \
         test/sc_test_reduce \
         test/sc_test_search \
@@ -24,11 +26,13 @@ check_PROGRAMS += $(sc_test_programs)
 test_sc_test_allgather_SOURCES = test/test_allgather.c
 test_sc_test_arrays_SOURCES = test/test_arrays.c
 test_sc_test_builtin_SOURCES = test/test_builtin.c
+test_sc_test_darray_work_SOURCES = test/test_darray_work.c
 test_sc_test_dmatrix_SOURCES = test/test_dmatrix.c
 test_sc_test_dmatrix_pool_SOURCES = test/test_dmatrix_pool.c
 test_sc_test_io_sink_SOURCES = test/test_io_sink.c
 test_sc_test_keyvalue_SOURCES = test/test_keyvalue.c
 test_sc_test_notify_SOURCES = test/test_notify.c
+test_sc_test_node_comm_SOURCES = test/test_node_comm.c
 ## Reenable and properly verify pqueue when it is actually used
 ## test_sc_test_pqueue_SOURCES = test/test_pqueue.c
 test_sc_test_reduce_SOURCES = test/test_reduce.c
@@ -42,6 +46,7 @@ LINT_CSOURCES += \
         $(test_sc_test_allgather_SOURCES) \
         $(test_sc_test_arrays_SOURCES) \
         $(test_sc_test_builtin_SOURCES) \
+        $(test_sc_test_darray_work_SOURCES) \
         $(test_sc_test_dmatrix_SOURCES) \
         $(test_sc_test_dmatrix_pool_SOURCES) \
         $(test_sc_test_io_sink_SOURCES) \
diff --git a/sc/test/test_allgather.c b/sc/test/test_allgather.c
index c96447e..b073a39 100644
--- a/sc/test/test_allgather.c
+++ b/sc/test/test_allgather.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/test/test_arrays.c b/sc/test/test_arrays.c
index 7d1720c..e3c713c 100644
--- a/sc/test/test_arrays.c
+++ b/sc/test/test_arrays.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -42,12 +43,16 @@ test_new_size (sc_array_t * a)
 {
   const size_t        S = a->elem_size;
   const size_t        N = a->elem_count;
+  size_t              zz;
   sc_array_t         *v, *w;
 
   v = sc_array_new_size (S, N);
   SC_CHECK_ABORT (v->elem_size == S && S == sizeof (int), "Size mismatch");
   SC_CHECK_ABORT (v->elem_count == N && N > 0, "Count mismatch");
   SC_CHECK_ABORT (v->byte_alloc <= a->byte_alloc, "Alloc mismatch");
+  for (zz = 0; zz < N; ++zz) {
+    *(int *) sc_array_index (v, zz) = (int) zz;
+  }
 
   w = sc_array_new (S);
   sc_array_copy (w, v);
diff --git a/sc/test/test_builtin.c b/sc/test/test_builtin.c
index dfabe98..3f1548b 100644
--- a/sc/test/test_builtin.c
+++ b/sc/test/test_builtin.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/test/test_darray_work.c b/sc/test/test_darray_work.c
new file mode 100644
index 0000000..d7d3c7d
--- /dev/null
+++ b/sc/test/test_darray_work.c
@@ -0,0 +1,81 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+#include <sc_dmatrix.h>
+
+int
+main (int argc, char **argv)
+{
+  const int           n_threads = 4;
+  const int           n_blocks = 19;
+  const int           n_entries = 31;
+#ifdef SC_MEMALIGN_BYTES
+  const int           memalign_bytes = SC_MEMALIGN_BYTES;
+#else
+  const int           memalign_bytes = 32;      /* used if the macro is unset */
+#endif /* SC_MEMALIGN_BYTES */
+  int                 mpiret;
+  sc_darray_work_t   *work;
+  double             *workd;
+  int                 t, b, i;
+
+  /* initialize mpi */
+  mpiret = sc_MPI_Init (&argc, &argv);
+  SC_CHECK_MPI (mpiret);
+
+  /* initialize sc */
+  sc_init (sc_MPI_COMM_WORLD, 1, 1, NULL, SC_LP_DEFAULT);
+
+  /* allocate workspace */
+  work = sc_darray_work_new (n_threads, n_blocks, n_entries, memalign_bytes);
+
+  /* block count must match exactly; block size may exceed the request */
+  SC_CHECK_ABORTF (n_blocks == sc_darray_work_get_blockcount (work),
+                   "Wrong number of blocks %i, should be %i\n",
+                   sc_darray_work_get_blockcount (work), n_blocks);
+  SC_CHECK_ABORTF (n_entries <= sc_darray_work_get_blocksize (work),
+                   "Insufficient number of entries per block %i, should be at least %i\n",
+                   sc_darray_work_get_blocksize (work), n_entries);
+
+  /* write the n_entries requested in every block; nothing is read back */
+  for (t = 0; t < n_threads; t++) {
+    for (b = 0; b < n_blocks; b++) {
+      workd = sc_darray_work_get (work, t, b);
+      for (i = 0; i < n_entries; i++) {
+        workd[i] = (double) i;
+      }
+    }
+  }
+
+  /* destroy */
+  sc_darray_work_destroy (work);
+
+  /* finalize sc */
+  sc_finalize ();
+
+  /* finalize mpi */
+  mpiret = sc_MPI_Finalize ();
+  SC_CHECK_MPI (mpiret);
+
+  return 0;
+}
diff --git a/sc/test/test_dmatrix.c b/sc/test/test_dmatrix.c
index f68a8d1..2f4b9a1 100644
--- a/sc/test/test_dmatrix.c
+++ b/sc/test/test_dmatrix.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -22,10 +23,68 @@
 
 #include <sc_dmatrix.h>
 
-#if defined(SC_WITH_BLAS) && defined(SC_WITH_LAPACK)
+#define TEST_DMATRIX_M 4
+#define TEST_DMATRIX_N 13
 
+#if defined(SC_WITH_BLAS) && defined(SC_WITH_LAPACK)
 static const double eps = 2.220446049250313e-16;
+#endif
+
+/**
+ * Generates a random number uniformly distributed in `[alpha,beta)`.
+ * \return  alpha + (beta - alpha) * rand () / (RAND_MAX + 1); beta excluded.
+ */
+static double
+test_dmatrix_get_random_uniform (const double alpha, const double beta)
+{
+  return alpha + (beta - alpha) * (rand () / ((double) RAND_MAX + 1.0));
+}
 
+/**
+ * Overwrites every entry of \a mat with a draw from the range alpha..beta.
+ */
+static void
+test_dmatrix_set_random (sc_dmatrix_t * mat, const double alpha,
+                         const double beta)
+{
+  double             *const data = mat->e[0];
+  const sc_bint_t     count = mat->m * mat->n;
+  sc_bint_t           k = 0;
+
+  while (k < count) {
+    data[k++] = test_dmatrix_get_random_uniform (alpha, beta);
+  }
+}
+
+/**
+ * Counts the entries in which \a mat_chk deviates from \a mat_ref.
+ * An entry deviates if the absolute difference is larger than DBL_MIN.
+ * Both matrices must hold the same total number of entries (asserted).
+ * \return  number of deviating entries.
+ */
+static              sc_bint_t
+test_dmatrix_check_error_identical (const sc_dmatrix_t * mat_chk,
+                                    const sc_dmatrix_t * mat_ref)
+{
+  const sc_bint_t     n_entries = mat_chk->m * mat_chk->n;
+  const double       *chk = mat_chk->e[0];
+  const double       *ref = mat_ref->e[0];
+  sc_bint_t           n_bad = 0;
+  sc_bint_t           k;
+
+  SC_ASSERT (n_entries == mat_ref->m * mat_ref->n);
+  for (k = 0; k < n_entries; ++k) {
+    const double        diff = chk[k] - ref[k];
+
+    n_bad += (fabs (diff) > DBL_MIN) ? 1 : 0;
+  }
+  return n_bad;
+}
+
+#if defined(SC_WITH_BLAS) && defined(SC_WITH_LAPACK)
+/**
+ * Tests multiplication with matrices of zero number of rows or columns or both.
+ */
 static void
 test_zero_sizes (void)
 {
@@ -45,17 +104,107 @@ test_zero_sizes (void)
   sc_dmatrix_destroy (m2);
   sc_dmatrix_destroy (m3);
 }
+#endif
+
+/**
+ * Tests function
+ *   sc_dmatrix_scale_shift
+ * against
+ *   sc_dmatrix_scale -> sc_dmatrix_shift
+ * on a TEST_DMATRIX_M x TEST_DMATRIX_N matrix of random values from 0..1.
+ * \return  number of entries with errors.
+ */
+static int
+test_scale_shift (void)
+{
+  const double        scale = M_PI;
+  const double        shift = M_E;
+  sc_dmatrix_t       *mat_chk, *mat_ref;
+  sc_bint_t           n_err_entries;
+
+  /* create matrices; both start from the same random values */
+  mat_chk = sc_dmatrix_new (TEST_DMATRIX_M, TEST_DMATRIX_N);
+  mat_ref = sc_dmatrix_new (TEST_DMATRIX_M, TEST_DMATRIX_N);
+  test_dmatrix_set_random (mat_chk, 0.0, 1.0);
+  sc_dmatrix_copy (mat_chk, mat_ref);
+
+  /* compute via function that's being tested */
+  sc_dmatrix_scale_shift (scale, shift, mat_chk);
+
+  /* compute reference in two separate steps */
+  sc_dmatrix_scale (scale, mat_ref);
+  sc_dmatrix_shift (shift, mat_ref);
+
+  /* count entries that differ between the two results */
+  n_err_entries = test_dmatrix_check_error_identical (mat_chk, mat_ref);
+
+  /* destroy */
+  sc_dmatrix_destroy (mat_chk);
+  sc_dmatrix_destroy (mat_ref);
+
+  /* return number of entries with errors */
+  return (int) n_err_entries;
+}
 
+/**
+ * Tests function
+ *   sc_dmatrix_dotmultiply_add
+ * against
+ *   sc_dmatrix_dotmultiply -> sc_dmatrix_add
+ * Without SC_WITH_BLAS the comparison is compiled out and 0 is returned.
+ * \return  number of entries with errors.
+ */
+static int
+test_dotmultiply_add (void)
+{
+  sc_bint_t           n_err_entries = 0;
+
+#if defined(SC_WITH_BLAS)
+  sc_dmatrix_t       *mat_in, *mat_mult;
+  sc_dmatrix_t       *mat_chk, *mat_ref;
+
+  /* create & fill matrices with random values */
+  mat_in = sc_dmatrix_new (TEST_DMATRIX_M, TEST_DMATRIX_N);
+  mat_mult = sc_dmatrix_new (TEST_DMATRIX_M, TEST_DMATRIX_N);
+  mat_chk = sc_dmatrix_new (TEST_DMATRIX_M, TEST_DMATRIX_N);
+  mat_ref = sc_dmatrix_new (TEST_DMATRIX_M, TEST_DMATRIX_N);
+  test_dmatrix_set_random (mat_in, 0.0, 1.0);
+  test_dmatrix_set_random (mat_mult, 0.0, 1.0);
+  test_dmatrix_set_random (mat_chk, 0.0, 1.0);
+  sc_dmatrix_copy (mat_chk, mat_ref);
+
+  /* compute via function that's being tested */
+  sc_dmatrix_dotmultiply_add (mat_mult, mat_in, mat_chk);
+
+  /* compute reference; this reuses mat_mult to hold the product */
+  sc_dmatrix_dotmultiply (mat_in, mat_mult);
+  sc_dmatrix_add (1.0, mat_mult, mat_ref);
+
+  /* count entries that differ between the two results */
+  n_err_entries = test_dmatrix_check_error_identical (mat_chk, mat_ref);
+
+  /* destroy */
+  sc_dmatrix_destroy (mat_in);
+  sc_dmatrix_destroy (mat_mult);
+  sc_dmatrix_destroy (mat_chk);
+  sc_dmatrix_destroy (mat_ref);
 #endif
 
+  /* return number of entries with errors */
+  return (int) n_err_entries;
+}
+
+/**
+ * Runs all dmatrix tests.
+ */
 int
 main (int argc, char **argv)
 {
   int                 num_failed_tests = 0;
+  int                 mpiret, testret;
 #if defined(SC_WITH_BLAS) && defined(SC_WITH_LAPACK)
   int                 j;
-  int                 mpiret;
-  sc_dmatrix_t       *A, *x, *xexact, *b, *bT, *xT, *xTexact;
+  sc_dmatrix_t       *A, *x, *xexact, *b, *bT, *xT, *xTexact, *A2, *b2;
   double              xmaxerror = 0.0;
   double              A_data[] = { 8.0, 1.0, 6.0,
     3.0, 5.0, 7.0,
@@ -63,17 +212,23 @@ main (int argc, char **argv)
   };
   double              b_data[] = { 1.0, 2.0, 3.0 };
   double              xexact_data[] = { -0.1 / 3.0, 1.4 / 3.0, -0.1 / 3.0 };
+#endif
 
   mpiret = sc_MPI_Init (&argc, &argv);
   SC_CHECK_MPI (mpiret);
 
   sc_init (sc_MPI_COMM_WORLD, 1, 1, NULL, SC_LP_DEFAULT);
 
+#if defined(SC_WITH_BLAS) && defined(SC_WITH_LAPACK)
   A = sc_dmatrix_new_data (3, 3, A_data);
+  A2 = sc_dmatrix_clone (A);
   b = sc_dmatrix_new_data (1, 3, b_data);
+  b2 = sc_dmatrix_clone (b);
   xexact = sc_dmatrix_new_data (1, 3, xexact_data);
   x = sc_dmatrix_new (1, 3);
 
+  /* Test 1: solve with A from the right */
+
   sc_dmatrix_rdivide (SC_NO_TRANS, b, A, x);
 
   sc_dmatrix_add (-1.0, xexact, x);
@@ -89,6 +244,25 @@ main (int argc, char **argv)
     ++num_failed_tests;
   }
 
+  /* Test 2: solve with A^T from the left */
+
+  sc_dmatrix_solve_transpose_inplace (A2, b2);
+
+  sc_dmatrix_add (-1.0, xexact, b2);
+
+  /* the difference to xexact is in b2; x belongs to Test 1 above */
+  xmaxerror = 0.0;
+  for (j = 0; j < 3; ++j) {
+    xmaxerror = SC_MAX (xmaxerror, fabs (b2->e[0][j]));
+  }
+  SC_LDEBUGF ("xmaxerror = %g\n", xmaxerror);
+
+  if (xmaxerror > 100.0 * eps) {
+    ++num_failed_tests;
+  }
+
+  /* Test 3: solve with A^T from the right */
+
   xexact->e[0][0] = 0.05;
   xexact->e[0][1] = 0.3;
   xexact->e[0][2] = 0.05;
@@ -108,6 +282,8 @@ main (int argc, char **argv)
     ++num_failed_tests;
   }
 
+  /* Test 4: solve with A from the left */
+
   bT = sc_dmatrix_new_data (3, 1, b_data);
   xT = sc_dmatrix_new (3, 1);
   xTexact = sc_dmatrix_new (3, 1);
@@ -138,14 +314,33 @@ main (int argc, char **argv)
   sc_dmatrix_destroy (b);
   sc_dmatrix_destroy (x);
   sc_dmatrix_destroy (xexact);
+  sc_dmatrix_destroy (A2);
+  sc_dmatrix_destroy (b2);
 
   test_zero_sizes ();
+#endif
 
+  /* Test 5: scale & shift */
+  testret = test_scale_shift ();
+  SC_LDEBUGF ("test_scale_shift: #entries with errors = %i\n", testret);
+  if (testret != 0) {
+    ++num_failed_tests;
+  }
+
+  /* Test 6: dotmultiply & add */
+  testret = test_dotmultiply_add ();
+  SC_LDEBUGF ("test_dotmultiply_add: #entries with errors = %i\n", testret);
+  if (testret != 0) {
+    ++num_failed_tests;
+  }
+
+  /* finalize sc */
   sc_finalize ();
 
+  /* finalize mpi */
   mpiret = sc_MPI_Finalize ();
   SC_CHECK_MPI (mpiret);
-#endif
 
+  /* return number of failed tests */
   return num_failed_tests;
 }
diff --git a/sc/test/test_dmatrix_pool.c b/sc/test/test_dmatrix_pool.c
index 59aabd1..f5c9621 100644
--- a/sc/test/test_dmatrix_pool.c
+++ b/sc/test/test_dmatrix_pool.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/test/test_io_sink.c b/sc/test/test_io_sink.c
index b8b9787..7c53371 100644
--- a/sc/test/test_io_sink.c
+++ b/sc/test/test_io_sink.c
@@ -2,7 +2,8 @@
   This file is part of the SC Library.
   The SC Library provides support for parallel scientific applications.
 
-  Copyright (C) 2012 Carsten Burstedde
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/test/test_keyvalue.c b/sc/test/test_keyvalue.c
index 019b618..0e11542 100644
--- a/sc/test/test_keyvalue.c
+++ b/sc/test/test_keyvalue.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/test/test_node_comm.c b/sc/test/test_node_comm.c
new file mode 100644
index 0000000..cf5c4ff
--- /dev/null
+++ b/sc/test/test_node_comm.c
@@ -0,0 +1,147 @@
+/*
+  This file is part of the SC Library.
+  The SC Library provides support for parallel scientific applications.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+
+  The SC Library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  The SC Library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with the SC Library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+*/
+
+/* activate random & srandom functions */
+#if !defined(_XOPEN_SOURCE)
+#define _XOPEN_SOURCE 500
+#elif defined(_XOPEN_SOURCE) && _XOPEN_SOURCE < 500
+#undef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 500
+#endif
+
+#include <sc.h>
+#include <sc_mpi.h>
+#include <sc_shmem.h>
+
+/** Compares sc_shmem allgather/memcpy/prefix with plain MPI results.
+ * \return 0 if all agree, else code of the first mismatch (1, 1 or 2). */
+int
+test_shmem (int count, sc_MPI_Comm comm, sc_shmem_type_t type)
+{
+  int                 i, p, size, mpiret, check, retval = 0;
+  long int           *myval, *recv_self, *recv_shmem, *scan_self, *scan_shmem,
+                     *copy_shmem;
+
+  sc_shmem_set_type (comm, type);
+  mpiret = sc_MPI_Comm_size (comm, &size);
+  SC_CHECK_MPI (mpiret);
+
+  myval = SC_ALLOC (long int, count);
+  for (i = 0; i < count; i++) {
+    myval[i] = random ();
+  }
+
+  recv_self = SC_ALLOC (long int, count * size);
+  scan_self = SC_ALLOC (long int, count * (size + 1));
+  mpiret = sc_MPI_Allgather (myval, count, sc_MPI_LONG,
+                             recv_self, count, sc_MPI_LONG, comm);
+  SC_CHECK_MPI (mpiret);
+  /* reference prefix: block p sums the values of all ranks below p */
+  for (i = 0; i < count; i++) {
+    scan_self[i] = 0;
+  }
+  for (p = 0; p < size; p++) {
+    for (i = 0; i < count; i++) {
+      scan_self[count * (p + 1) + i] =
+        scan_self[count * p + i] + recv_self[count * p + i];
+    }
+  }
+  /* on mismatch do not return early: all buffers below must be freed */
+  recv_shmem = SC_SHMEM_ALLOC (long int, (size_t) count * size, comm);
+  sc_shmem_allgather (myval, count, sc_MPI_LONG,
+                      recv_shmem, count, sc_MPI_LONG, comm);
+  check = memcmp (recv_self, recv_shmem, count * sizeof (long int) * size);
+  if (check) {
+    SC_GLOBAL_LERROR ("sc_shmem_allgather mismatch\n");
+    retval = 1;
+  }
+
+  copy_shmem = SC_SHMEM_ALLOC (long int, (size_t) count * size, comm);
+  sc_shmem_memcpy (copy_shmem, recv_shmem,
+                   (size_t) count * sizeof (long int) * size, comm);
+  check = memcmp (recv_shmem, copy_shmem, count * sizeof (long int) * size);
+  if (check) {
+    SC_GLOBAL_LERROR ("sc_shmem_memcpy mismatch\n");
+    retval = 1;
+  }
+  SC_SHMEM_FREE (copy_shmem, comm);
+  SC_SHMEM_FREE (recv_shmem, comm);
+
+  scan_shmem = SC_SHMEM_ALLOC (long int, (size_t) count * (size + 1), comm);
+  sc_shmem_prefix (myval, scan_shmem, count, sc_MPI_LONG, sc_MPI_SUM, comm);
+  check =
+    memcmp (scan_self, scan_shmem, count * sizeof (long int) * (size + 1));
+  if (check) {
+    SC_GLOBAL_LERROR ("sc_shmem_prefix mismatch\n");
+    retval = retval ? retval : 2;
+  }
+  SC_SHMEM_FREE (scan_shmem, comm);
+  SC_FREE (scan_self);
+  SC_FREE (recv_self);
+  SC_FREE (myval);
+  return retval;
+}
+
+int
+main (int argc, char **argv)
+{
+  int                 mpiret, rank, size;
+  int                 count;
+  int                 itype;
+  int                 retval = 0;
+  /* runs test_shmem for each shmem type with 1 to 3 entries per rank */
+  mpiret = sc_MPI_Init (&argc, &argv);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_rank (sc_MPI_COMM_WORLD, &rank);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_size (sc_MPI_COMM_WORLD, &size);
+  SC_CHECK_MPI (mpiret);
+
+  sc_init (sc_MPI_COMM_WORLD, 1, 1, NULL, SC_LP_DEFAULT);
+  /* seed with the rank; presumably so that ranks draw different values */
+  srandom (rank);
+  for (itype = 0; itype < (int) SC_SHMEM_NUM_TYPES; itype++) {
+    /* itype walks every enum value below SC_SHMEM_NUM_TYPES */
+    SC_GLOBAL_PRODUCTIONF ("sc_shmem type: %s\n",
+                           sc_shmem_type_to_string[itype]);
+    for (count = 1; count <= 3; count++) {
+      int                 retvalin = retval;
+      /* retvalin is the sum before this call, used to spot a new failure */
+      SC_GLOBAL_PRODUCTIONF ("  count = %d\n", count);
+      retval +=
+        test_shmem (count, sc_MPI_COMM_WORLD, (sc_shmem_type_t) itype);
+      if (retval != retvalin) {
+        SC_GLOBAL_PRODUCTION ("    unsuccessful\n");
+      }
+      else {
+        SC_GLOBAL_PRODUCTION ("    successful\n");
+      }
+    }
+  }
+
+  sc_finalize ();
+  /* the exit status is the sum of all test_shmem return codes */
+  mpiret = sc_MPI_Finalize ();
+  SC_CHECK_MPI (mpiret);
+  return retval;
+}
diff --git a/sc/test/test_notify.c b/sc/test/test_notify.c
index 283b8ed..e31471b 100644
--- a/sc/test/test_notify.c
+++ b/sc/test/test_notify.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/test/test_pqueue.c b/sc/test/test_pqueue.c
index 6587fac..93b2953 100644
--- a/sc/test/test_pqueue.c
+++ b/sc/test/test_pqueue.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/test/test_reduce.c b/sc/test/test_reduce.c
index 0bd6436..2dfb6ab 100644
--- a/sc/test/test_reduce.c
+++ b/sc/test/test_reduce.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/test/test_search.c b/sc/test/test_search.c
index fa5e4aa..ee050af 100644
--- a/sc/test/test_search.c
+++ b/sc/test/test_search.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/test/test_sort.c b/sc/test/test_sort.c
index 34daa1f..7645dc2 100644
--- a/sc/test/test_sort.c
+++ b/sc/test/test_sort.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/sc/test/test_sortb.c b/sc/test/test_sortb.c
index 802b422..13cfba1 100644
--- a/sc/test/test_sortb.c
+++ b/sc/test/test_sortb.c
@@ -3,6 +3,7 @@
   The SC Library provides support for parallel scientific applications.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
 
   The SC Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
diff --git a/src/Makefile.am b/src/Makefile.am
index ca06c11..70f36f6 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -25,6 +25,7 @@ libp4est_compiled_sources += \
         src/p4est_points.c src/p4est_geometry.c \
         src/p4est_iterate.c src/p4est_lnodes.c src/p4est_mesh.c \
         src/p4est_balance.c src/p4est_io.c \
+        src/p4est_connrefine.c \
         src/p4est_wrap.c src/p4est_plex.c
 endif
 if P4EST_ENABLE_BUILD_3D
@@ -46,9 +47,23 @@ libp4est_compiled_sources += \
         src/p8est_points.c src/p8est_geometry.c \
         src/p8est_iterate.c src/p8est_lnodes.c src/p8est_mesh.c \
         src/p8est_tets_hexes.c src/p8est_balance.c src/p8est_io.c \
+        src/p8est_connrefine.c \
         src/p8est_wrap.c src/p8est_plex.c
 endif
-include example/p6est/Makefile.am
+if P4EST_ENABLE_BUILD_2D
+if P4EST_ENABLE_BUILD_3D
+if P4EST_ENABLE_BUILD_P6EST
+libp4est_installed_headers += \
+        src/p6est.h src/p6est_ghost.h src/p6est_lnodes.h \
+        src/p6est_profile.h src/p6est_vtk.h \
+        src/p6est_extended.h src/p6est_communication.h
+libp4est_compiled_sources += \
+        src/p6est.c src/p6est_ghost.c src/p6est_lnodes.c \
+        src/p6est_profile.c src/p6est_vtk.c \
+        src/p6est_communication.c
+endif
+endif
+endif
 
 # this variable is used for headers that are not publicly installed
 P4EST_CPPFLAGS =
diff --git a/src/p4est.c b/src/p4est.c
index 76eebf6..dac3f92 100644
--- a/src/p4est.c
+++ b/src/p4est.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -147,11 +148,17 @@ p4est_qcoord_to_vertex (p4est_connectivity_t * connectivity,
 size_t
 p4est_memory_used (p4est_t * p4est)
 {
-  const int           mpisize = p4est->mpisize;
+  int                 mpisize;
   size_t              size;
   p4est_topidx_t      nt;
   p4est_tree_t       *tree;
 
+  /* do not assert p4est_is_valid since it is collective */
+  P4EST_ASSERT (p4est != NULL);
+  P4EST_ASSERT (p4est->connectivity != NULL);
+  P4EST_ASSERT (p4est->trees != NULL);
+
+  mpisize = p4est->mpisize;
   size = sizeof (p4est_t) +
     (mpisize + 1) * (sizeof (p4est_gloidx_t) + sizeof (p4est_quadrant_t));
 
@@ -162,13 +169,25 @@ p4est_memory_used (p4est_t * p4est)
   }
 
   if (p4est->data_size > 0) {
+    P4EST_ASSERT (p4est->user_data_pool != NULL);
     size += sc_mempool_memory_used (p4est->user_data_pool);
   }
+  P4EST_ASSERT (p4est->quadrant_pool != NULL);
   size += sc_mempool_memory_used (p4est->quadrant_pool);
 
   return size;
 }
 
+long
+p4est_revision (p4est_t * p4est)
+{
+  /* do not assert p4est_is_valid since it is collective */
+  P4EST_ASSERT (p4est != NULL);
+  P4EST_ASSERT (p4est->revision >= 0);
+  /* read-only access to the revision counter stored in the p4est */
+  return p4est->revision;
+}
+
 p4est_t            *
 p4est_new (sc_MPI_Comm mpicomm, p4est_connectivity_t * connectivity,
            size_t data_size, p4est_init_t init_fn, void *user_pointer)
@@ -182,7 +201,6 @@ p4est_new_ext (sc_MPI_Comm mpicomm, p4est_connectivity_t * connectivity,
                p4est_locidx_t min_quadrants, int min_level, int fill_uniform,
                size_t data_size, p4est_init_t init_fn, void *user_pointer)
 {
-  int                 mpiret;
   int                 num_procs, rank;
   int                 i, must_remove_last_quadrant;
   int                 level;
@@ -208,22 +226,18 @@ p4est_new_ext (sc_MPI_Comm mpicomm, p4est_connectivity_t * connectivity,
   P4EST_ASSERT (p4est_connectivity_is_valid (connectivity));
   P4EST_ASSERT (min_level <= P4EST_QMAXLEVEL);
 
-  /* retrieve MPI information */
-  mpiret = sc_MPI_Comm_size (mpicomm, &num_procs);
-  SC_CHECK_MPI (mpiret);
-  mpiret = sc_MPI_Comm_rank (mpicomm, &rank);
-  SC_CHECK_MPI (mpiret);
-
-  /* assign some data members */
+  /* create p4est object and assign some data members */
   p4est = P4EST_ALLOC_ZERO (p4est_t, 1);
-  p4est->mpicomm = mpicomm;
-  p4est->mpisize = num_procs;
-  p4est->mpirank = rank;
   p4est->data_size = data_size;
   p4est->user_pointer = user_pointer;
   p4est->connectivity = connectivity;
   num_trees = connectivity->num_trees;
 
+  /* set parallel environment */
+  p4est_comm_parallel_env_assign (p4est, mpicomm);
+  num_procs = p4est->mpisize;
+  rank = p4est->mpirank;
+
   /* allocate memory pools */
   if (p4est->data_size > 0) {
     p4est->user_data_pool = sc_mempool_new (p4est->data_size);
@@ -471,6 +485,7 @@ p4est_new_ext (sc_MPI_Comm mpicomm, p4est_connectivity_t * connectivity,
   P4EST_VERBOSEF ("total local quadrants %lld\n",
                   (long long) p4est->local_num_quadrants);
 
+  P4EST_ASSERT (p4est->revision == 0);
   P4EST_ASSERT (p4est_is_valid (p4est));
   p4est_log_indent_pop ();
   P4EST_GLOBAL_PRODUCTIONF ("Done " P4EST_STRING
@@ -508,6 +523,7 @@ p4est_destroy (p4est_t * p4est)
   }
   sc_mempool_destroy (p4est->quadrant_pool);
 
+  p4est_comm_parallel_env_release (p4est);
   P4EST_FREE (p4est->global_first_quadrant);
   P4EST_FREE (p4est->global_first_position);
   P4EST_FREE (p4est);
@@ -516,6 +532,12 @@ p4est_destroy (p4est_t * p4est)
 p4est_t            *
 p4est_copy (p4est_t * input, int copy_data)
 {
+  return p4est_copy_ext (input, copy_data, 0 /* don't duplicate MPI comm */ );
+}
+
+p4est_t            *
+p4est_copy_ext (p4est_t * input, int copy_data, int duplicate_mpicomm)
+{
   const p4est_topidx_t num_trees = input->connectivity->num_trees;
   const p4est_topidx_t first_tree = input->first_local_tree;
   const p4est_topidx_t last_tree = input->last_local_tree;
@@ -536,6 +558,12 @@ p4est_copy (p4est_t * input, int copy_data)
   p4est->user_data_pool = NULL;
   p4est->quadrant_pool = NULL;
 
+  /* set parallel environment */
+  p4est_comm_parallel_env_assign (p4est, input->mpicomm);
+  if (duplicate_mpicomm) {
+    p4est_comm_parallel_env_duplicate (p4est);
+  }
+
   /* allocate a user data pool if necessary and a quadrant pool */
   if (copy_data && p4est->data_size > 0) {
     p4est->user_data_pool = sc_mempool_new (p4est->data_size);
@@ -586,6 +614,9 @@ p4est_copy (p4est_t * input, int copy_data)
   memcpy (p4est->global_first_position, input->global_first_position,
           (p4est->mpisize + 1) * sizeof (p4est_quadrant_t));
 
+  /* the copy starts with a revision count of zero */
+  p4est->revision = 0;
+
   /* check for valid p4est and return */
   P4EST_ASSERT (p4est_is_valid (p4est));
 
@@ -658,6 +689,7 @@ p4est_refine_ext (p4est_t * p4est, int refine_recursive, int allowed_level,
   int                 firsttime;
   int                 i, maxlevel;
   p4est_topidx_t      nt;
+  p4est_gloidx_t      old_gnq;
   size_t              incount, current, restpos, movecount;
   sc_list_t          *list;
   p4est_tree_t       *tree;
@@ -683,6 +715,9 @@ p4est_refine_ext (p4est_t * p4est, int refine_recursive, int allowed_level,
   P4EST_ASSERT (0 <= allowed_level && allowed_level <= P4EST_QMAXLEVEL);
   P4EST_ASSERT (refine_fn != NULL);
 
+  /* remember input quadrant count; it will not decrease */
+  old_gnq = p4est->global_num_quadrants;
+
   /*
      q points to a quadrant that is an array member
      qalloc is a quadrant that has been allocated through quadrant_pool
@@ -740,7 +775,7 @@ p4est_refine_ext (p4est_t * p4est, int refine_recursive, int allowed_level,
     qalloc = p4est_quadrant_mempool_alloc (p4est->quadrant_pool);
     *qalloc = *q;               /* never prepend array members directly */
     qalloc->pad8 = 0;           /* this quadrant has not been refined yet */
-    sc_list_prepend (list, qalloc);     /* only newly allocated quadrants */
+    (void) sc_list_prepend (list, qalloc);      /* only new quadrants */
 
     P4EST_QUADRANT_INIT (&parent);
 
@@ -798,15 +833,15 @@ p4est_refine_ext (p4est_t * p4est, int refine_recursive, int allowed_level,
         p4est_quadrant_init_data (p4est, nt, c7, init_fn);
         c4->pad8 = c5->pad8 = c6->pad8 = c7->pad8 = 1;
 
-        sc_list_prepend (list, c7);
-        sc_list_prepend (list, c6);
-        sc_list_prepend (list, c5);
-        sc_list_prepend (list, c4);
+        (void) sc_list_prepend (list, c7);
+        (void) sc_list_prepend (list, c6);
+        (void) sc_list_prepend (list, c5);
+        (void) sc_list_prepend (list, c4);
 #endif
-        sc_list_prepend (list, c3);
-        sc_list_prepend (list, c2);
-        sc_list_prepend (list, c1);
-        sc_list_prepend (list, c0);
+        (void) sc_list_prepend (list, c3);
+        (void) sc_list_prepend (list, c2);
+        (void) sc_list_prepend (list, c1);
+        (void) sc_list_prepend (list, c0);
 
         if (replace_fn != NULL) {
           /* in family mode we always call the replace callback right
@@ -834,7 +869,7 @@ p4est_refine_ext (p4est_t * p4est, int refine_recursive, int allowed_level,
             qalloc = p4est_quadrant_mempool_alloc (p4est->quadrant_pool);
             *qalloc = *q;       /* never append array members directly */
             qalloc->pad8 = 0;   /* has not been refined yet */
-            sc_list_append (list, qalloc);      /* only newly allocated quadrants */
+            (void) sc_list_append (list, qalloc);       /* only new quadrants */
             --movecount;
             ++restpos;
           }
@@ -878,6 +913,10 @@ p4est_refine_ext (p4est_t * p4est, int refine_recursive, int allowed_level,
 
   /* compute global number of quadrants */
   p4est_comm_count_quadrants (p4est);
+  P4EST_ASSERT (p4est->global_num_quadrants >= old_gnq);
+  if (old_gnq != p4est->global_num_quadrants) {
+    ++p4est->revision;
+  }
 
   P4EST_ASSERT (p4est_is_valid (p4est));
   p4est_log_indent_pop ();
@@ -909,6 +948,7 @@ p4est_coarsen_ext (p4est_t * p4est,
   size_t              window, start, length, cidz;
   p4est_locidx_t      num_quadrants, prev_offset;
   p4est_topidx_t      jt;
+  p4est_gloidx_t      old_gnq;
   p4est_tree_t       *tree;
   p4est_quadrant_t   *c[P4EST_CHILDREN];
   p4est_quadrant_t   *cfirst, *clast;
@@ -922,6 +962,9 @@ p4est_coarsen_ext (p4est_t * p4est,
   P4EST_ASSERT (p4est_is_valid (p4est));
   P4EST_ASSERT (coarsen_fn != NULL);
 
+  /* remember input quadrant count; it will not increase */
+  old_gnq = p4est->global_num_quadrants;
+
   P4EST_QUADRANT_INIT (&qtemp);
 
   /* loop over all local trees */
@@ -1075,6 +1118,10 @@ p4est_coarsen_ext (p4est_t * p4est,
 
   /* compute global number of quadrants */
   p4est_comm_count_quadrants (p4est);
+  P4EST_ASSERT (p4est->global_num_quadrants <= old_gnq);
+  if (old_gnq != p4est->global_num_quadrants) {
+    ++p4est->revision;
+  }
 
   P4EST_ASSERT (p4est_is_valid (p4est));
   p4est_log_indent_pop ();
@@ -1215,6 +1262,7 @@ p4est_balance_ext (p4est_t * p4est, p4est_connect_type_t btype,
   p4est_topidx_t      qtree, nt;
   p4est_topidx_t      first_tree, last_tree;
   p4est_locidx_t      skipped;
+  p4est_gloidx_t      old_gnq;
   p4est_balance_peer_t *peers, *peer;
   p4est_tree_t       *tree;
   p4est_quadrant_t    mylow, nextlow;
@@ -1285,6 +1333,9 @@ p4est_balance_ext (p4est_t * p4est, p4est_connect_type_t btype,
                 btype == P8EST_CONNECT_CORNER);
 #endif
 
+  /* remember input quadrant count; it will not decrease */
+  old_gnq = p4est->global_num_quadrants;
+
 #ifdef P4EST_ENABLE_DEBUG
   data_pool_size = 0;
   if (p4est->user_data_pool != NULL) {
@@ -1433,7 +1484,7 @@ p4est_balance_ext (p4est_t * p4est, p4est_connect_type_t btype,
       qh = P4EST_QUADRANT_LEN (q->level);
       if (p4est_comm_neighborhood_owned (p4est, nt,
                                          full_tree, tree_contact, q)) {
-        /* this quadrant's 3x3 neighborhood is onwed by this processor */
+        /* this quadrant's 3x3 neighborhood is owned by this processor */
         ++skipped;
         continue;
       }
@@ -1806,21 +1857,23 @@ p4est_balance_ext (p4est_t * p4est, p4est_connect_type_t btype,
 
   /* verify sc_ranges and sc_notify against each other */
   if (is_ranges_active && is_notify_active && is_balance_verify) {
+#ifdef P4EST_ENABLE_DEBUG
     int                 found_in_ranges, found_in_notify;
+#endif
 
     /* verify receiver side */
     P4EST_ASSERT (num_receivers_notify <= num_receivers_ranges);
     k = l = 0;
     for (j = 0; j < num_procs; ++j) {
-      found_in_ranges = found_in_notify = 0;
+      P4EST_DEBUG_EXECUTE (found_in_ranges = found_in_notify = 0);
       if (k < num_receivers_ranges && receiver_ranks_ranges[k] == j) {
         P4EST_ASSERT (j != rank);
-        found_in_ranges = 1;
+        P4EST_DEBUG_EXECUTE (found_in_ranges = 1);
         ++k;
       }
       if (l < num_receivers_notify && receiver_ranks_notify[l] == j) {
         P4EST_ASSERT (j != rank && found_in_ranges);
-        found_in_notify = 1;
+        P4EST_DEBUG_EXECUTE (found_in_notify = 1);
         ++l;
       }
       if (j != rank && peers[j].send_first.elem_count > 0) {
@@ -1837,15 +1890,15 @@ p4est_balance_ext (p4est_t * p4est, p4est_connect_type_t btype,
     P4EST_ASSERT (num_senders_notify <= num_senders_ranges);
     k = l = 0;
     for (j = 0; j < num_procs; ++j) {
-      found_in_ranges = found_in_notify = 0;
+      P4EST_DEBUG_EXECUTE (found_in_ranges = found_in_notify = 0);
       if (k < num_senders_ranges && sender_ranks_ranges[k] == j) {
         P4EST_ASSERT (j != rank);
-        found_in_ranges = 1;
+        P4EST_DEBUG_EXECUTE (found_in_ranges = 1);
         ++k;
       }
       if (l < num_senders_notify && sender_ranks_notify[l] == j) {
         P4EST_ASSERT (j != rank && found_in_ranges);
-        found_in_notify = 1;    /* kept for symmetry */
+        P4EST_DEBUG_EXECUTE (found_in_notify = 1);      /* for symmetry */
         ++l;
       }
     }
@@ -2346,6 +2399,10 @@ p4est_balance_ext (p4est_t * p4est, p4est_connect_type_t btype,
 
   /* compute global number of quadrants */
   p4est_comm_count_quadrants (p4est);
+  P4EST_ASSERT (p4est->global_num_quadrants >= old_gnq);
+  if (old_gnq != p4est->global_num_quadrants) {
+    ++p4est->revision;
+  }
 
   /* some sanity checks */
   P4EST_ASSERT ((p4est_locidx_t) all_outcount == p4est->local_num_quadrants);
@@ -2415,6 +2472,9 @@ p4est_partition_ext (p4est_t * p4est, int partition_for_coarsening,
   /* this function does nothing in a serial setup */
   if (p4est->mpisize == 1) {
     P4EST_GLOBAL_PRODUCTION ("Done " P4EST_STRING "_partition no shipping\n");
+
+    /* in particular, there is no need to bump the revision counter */
+    P4EST_ASSERT (global_shipped == 0);
     return global_shipped;
   }
 
@@ -2425,7 +2485,7 @@ p4est_partition_ext (p4est_t * p4est, int partition_for_coarsening,
   num_quadrants_in_proc = P4EST_ALLOC (p4est_locidx_t, num_procs);
 
   if (weight_fn == NULL) {
-    /* Divide up the quadants equally */
+    /* Divide up the quadrants equally */
     for (p = 0, next_quadrant = 0; p < num_procs; ++p) {
       prev_quadrant = next_quadrant;
       next_quadrant =
@@ -2496,6 +2556,9 @@ p4est_partition_ext (p4est_t * p4est, int partition_for_coarsening,
       p4est_log_indent_pop ();
       P4EST_GLOBAL_PRODUCTION ("Done " P4EST_STRING
                                "_partition no shipping\n");
+
+      /* in particular, there is no need to bump the revision counter */
+      P4EST_ASSERT (global_shipped == 0);
       return global_shipped;
     }
 
@@ -2698,6 +2761,10 @@ p4est_partition_ext (p4est_t * p4est, int partition_for_coarsening,
 
   /* run the partition algorithm with proper quadrant counts */
   global_shipped = p4est_partition_given (p4est, num_quadrants_in_proc);
+  if (global_shipped) {
+    /* the partition of the forest has changed somewhere */
+    ++p4est->revision;
+  }
   P4EST_FREE (num_quadrants_in_proc);
 
   /* check validity of the p4est */
@@ -2801,8 +2868,9 @@ p4est_partition_for_coarsening (p4est_t * p4est,
         quad_id_near_cut = partition_new[i];
       }
       else {
-        if (abs (partition_new[i] - partition_now[rank]) <
-            abs (partition_new[i] - partition_now[rank + 1] + 1)) {
+        if (P4EST_GLOIDX_ABS (partition_new[i] - partition_now[rank]) <
+            P4EST_GLOIDX_ABS (partition_new[i] - partition_now[rank + 1] +
+                              1)) {
           quad_id_near_cut = partition_now[rank];
         }
         else {
@@ -3117,7 +3185,8 @@ p4est_partition_for_coarsening (p4est_t * p4est,
     if (0 < current_proc && current_proc < num_procs) {
       /* if any process but first */
       num_quadrants_in_proc[current_proc] += correction[current_proc];
-      num_moved_quadrants += (p4est_gloidx_t) abs (correction[current_proc]);
+      /* input is just a locidx, but the result is gloidx so we cast cleanly */
+      num_moved_quadrants += P4EST_GLOIDX_ABS (correction[current_proc]);
     }
     if (next_proc < num_procs) {
       /* if first process or next process is feasible */
@@ -3462,6 +3531,7 @@ p4est_source_ext (sc_io_source_t * src, sc_MPI_Comm mpicomm, size_t data_size,
 {
   const int           headc = 6;
   const int           align = 32;
+  int                 root = 0;
   int                 retval;
   int                 mpiret;
   int                 num_procs, rank;
@@ -3469,6 +3539,7 @@ p4est_source_ext (sc_io_source_t * src, sc_MPI_Comm mpicomm, size_t data_size,
   int                 save_data;
   int                 i;
   uint64_t           *u64a, u64int;
+  size_t              conn_bytes, file_offset;
   size_t              save_data_size;
   size_t              qbuf_size, comb_size, head_count;
   size_t              zz, zcount, zpadding;
@@ -3481,7 +3552,13 @@ p4est_source_ext (sc_io_source_t * src, sc_MPI_Comm mpicomm, size_t data_size,
   sc_array_t         *qarr, *darr;
   char               *dap, *lbuf;
 
-  SC_CHECK_ABORT (!broadcasthead, "Header broadcast not implemented");
+  /* set some parameters */
+  P4EST_ASSERT (src->bytes_out == 0);
+  P4EST_ASSERT (connectivity != NULL);
+  if (data_size == 0) {
+    load_data = 0;
+  }
+  qbuf_size = (P4EST_DIM + 1) * sizeof (p4est_qcoord_t);
 
   /* retrieve MPI information */
   mpiret = sc_MPI_Comm_size (mpicomm, &num_procs);
@@ -3489,88 +3566,131 @@ p4est_source_ext (sc_io_source_t * src, sc_MPI_Comm mpicomm, size_t data_size,
   mpiret = sc_MPI_Comm_rank (mpicomm, &rank);
   SC_CHECK_MPI (mpiret);
 
-  /* read connectivity */
-  conn = *connectivity = p4est_connectivity_source (src);
-  SC_CHECK_ABORT (conn != NULL, "connectivity source");
-  zcount = src->bytes_out;
-  zpadding = (align - zcount % align) % align;
-  retval = sc_io_source_read (src, NULL, zpadding, NULL);
-  SC_CHECK_ABORT (!retval, "source padding");
+  /* the first part of the header determines further offsets */
+  conn = NULL;
+  conn_bytes = 0;
+  u64a = P4EST_ALLOC (uint64_t, headc + 1);
+  if (!broadcasthead || rank == root) {
+
+    /* read the forest connectivity */
+    conn = p4est_connectivity_source (src);
+    SC_CHECK_ABORT (conn != NULL, "connectivity source");
+    zcount = src->bytes_out;
+    zpadding = (align - zcount % align) % align;
+    retval = sc_io_source_read (src, NULL, zpadding, NULL);
+    SC_CHECK_ABORT (!retval, "source padding");
+    conn_bytes = src->bytes_out;
+
+    /* read format and some basic partition parameters */
+    retval = sc_io_source_read (src, u64a, sizeof (uint64_t) * (size_t) headc,
+                                NULL);
+    SC_CHECK_ABORT (!retval, "read format");
+    SC_CHECK_ABORT (u64a[0] == P4EST_ONDISK_FORMAT, "invalid format");
+    SC_CHECK_ABORT (u64a[1] == (uint64_t) sizeof (p4est_qcoord_t),
+                    "invalid coordinate size");
+    SC_CHECK_ABORT (u64a[2] == (uint64_t) sizeof (p4est_quadrant_t),
+                    "invalid quadrant size");
+    save_data_size = (size_t) u64a[3];
+    save_data = (int) u64a[4];
+    if (load_data) {
+      SC_CHECK_ABORT (save_data_size == data_size, "invalid data size");
+      SC_CHECK_ABORT (save_data, "quadrant data not saved");
+    }
+    save_num_procs = (int) u64a[5];
+    SC_CHECK_ABORT (autopartition || num_procs == save_num_procs,
+                    "num procs mismatch");
 
-  /* set some parameters */
-  if (data_size == 0) {
-    load_data = 0;
+    /* piggy-back the bytes for the connectivity onto first message */
+    u64a[headc + 0] = (uint64_t) conn_bytes;
   }
-  num_trees = conn->num_trees;
-  qbuf_size = (P4EST_DIM + 1) * sizeof (p4est_qcoord_t);
+  if (broadcasthead) {
 
-  /* read format and partition information */
-  u64a = P4EST_ALLOC (uint64_t, headc);
-  retval = sc_io_source_read (src, u64a, sizeof (uint64_t) * (size_t) headc,
-                              NULL);
-  SC_CHECK_ABORT (!retval, "read format");
-  SC_CHECK_ABORT (u64a[0] == P4EST_ONDISK_FORMAT, "invalid format");
-  SC_CHECK_ABORT (u64a[1] == (uint64_t) sizeof (p4est_qcoord_t),
-                  "invalid coordinate size");
-  SC_CHECK_ABORT (u64a[2] == (uint64_t) sizeof (p4est_quadrant_t),
-                  "invalid quadrant size");
-  save_data_size = (size_t) u64a[3];
-  save_data = (int) u64a[4];
-  if (load_data) {
-    SC_CHECK_ABORT (save_data_size == data_size, "invalid data size");
-    SC_CHECK_ABORT (save_data, "quadrant data not saved");
+    /* broadcast connectivity and first part of header */
+    conn = p4est_connectivity_bcast (conn, root, mpicomm);
+    mpiret = sc_MPI_Bcast (u64a, headc + 1, sc_MPI_LONG_LONG_INT,
+                           root, mpicomm);
+    SC_CHECK_MPI (mpiret);
+    if (rank != root) {
+
+      /* make sure the rest of the processes has the information */
+      SC_CHECK_ABORT (u64a[0] == P4EST_ONDISK_FORMAT, "invalid format");
+      save_data_size = (size_t) u64a[3];
+      save_data = (int) u64a[4];
+      save_num_procs = (int) u64a[5];
+      conn_bytes = (size_t) u64a[headc + 0];
+    }
   }
-  save_num_procs = (int) u64a[5];
+  *connectivity = conn;
   comb_size = qbuf_size + save_data_size;
-  SC_CHECK_ABORT (autopartition || num_procs == save_num_procs,
-                  "num procs mismatch");
+  file_offset = conn_bytes + headc * sizeof (uint64_t);
 
   /* create partition data */
   gfq = P4EST_ALLOC (p4est_gloidx_t, num_procs + 1);
-  if (!autopartition) {
-    P4EST_ASSERT (num_procs == save_num_procs);
-    u64a = P4EST_REALLOC (u64a, uint64_t, num_procs);
-    sc_io_source_read (src, u64a, sizeof (uint64_t) * (size_t) num_procs,
-                       NULL);
-    SC_CHECK_ABORT (!retval, "read quadrant partition");
-    gfq[0] = 0;
-    for (i = 0; i < num_procs; ++i) {
-      gfq[i + 1] = (p4est_gloidx_t) u64a[i];
+  gfq[0] = 0;
+  if (!broadcasthead || rank == root) {
+    if (!autopartition) {
+      P4EST_ASSERT (num_procs == save_num_procs);
+      u64a = P4EST_REALLOC (u64a, uint64_t, num_procs);
+      sc_io_source_read (src, u64a, sizeof (uint64_t) * (size_t) num_procs,
+                         NULL);
+      SC_CHECK_ABORT (!retval, "read quadrant partition");
+      for (i = 0; i < num_procs; ++i) {
+        gfq[i + 1] = (p4est_gloidx_t) u64a[i];
+      }
     }
-  }
-  else {
-    /* ignore saved partition and compute a new uniform one */
-    retval = sc_io_source_read
-      (src, NULL, (long) ((save_num_procs - 1) * sizeof (uint64_t)), NULL);
-    SC_CHECK_ABORT (!retval, "seek over ignored partition");
-    retval = sc_io_source_read (src, &u64int, sizeof (uint64_t), NULL);
-    SC_CHECK_ABORT (!retval, "read quadrant count");
-    for (i = 0; i <= num_procs; ++i) {
-      gfq[i] = p4est_partition_cut_uint64 (u64int, i, num_procs);
+    else {
+      /* ignore saved partition and compute a new uniform one */
+      retval = sc_io_source_read
+        (src, NULL, (long) ((save_num_procs - 1) * sizeof (uint64_t)), NULL);
+      SC_CHECK_ABORT (!retval, "seek over ignored partition");
+      retval = sc_io_source_read (src, &u64int, sizeof (uint64_t), NULL);
+      SC_CHECK_ABORT (!retval, "read quadrant count");
+      for (i = 1; i <= num_procs; ++i) {
+        gfq[i] = p4est_partition_cut_uint64 (u64int, i, num_procs);
+      }
     }
   }
+  if (broadcasthead) {
+    mpiret = sc_MPI_Bcast (gfq + 1, num_procs, P4EST_MPI_GLOIDX,
+                           root, mpicomm);
+    SC_CHECK_MPI (mpiret);
+  }
   zcount = (size_t) (gfq[rank + 1] - gfq[rank]);
+  file_offset += save_num_procs * sizeof (uint64_t);
 
   /* read pertree data */
-  u64a = P4EST_REALLOC (u64a, uint64_t, num_trees);
-  retval = sc_io_source_read (src, u64a, sizeof (uint64_t) * (size_t)
-                              num_trees, NULL);
-  SC_CHECK_ABORT (!retval, "read pertree information");
+  num_trees = conn->num_trees;
   pertree = P4EST_ALLOC (p4est_gloidx_t, num_trees + 1);
   pertree[0] = 0;
-  for (jt = 0; jt < num_trees; ++jt) {
-    pertree[jt + 1] = (p4est_gloidx_t) u64a[jt];
+  if (!broadcasthead || rank == root) {
+    u64a = P4EST_REALLOC (u64a, uint64_t, num_trees);
+    retval = sc_io_source_read (src, u64a, sizeof (uint64_t) * (size_t)
+                                num_trees, NULL);
+    SC_CHECK_ABORT (!retval, "read pertree information");
+    for (jt = 0; jt < num_trees; ++jt) {
+      pertree[jt + 1] = (p4est_gloidx_t) u64a[jt];
+    }
+    SC_CHECK_ABORT (gfq[num_procs] == pertree[num_trees], "pertree mismatch");
+  }
+  if (broadcasthead) {
+    mpiret = sc_MPI_Bcast (pertree + 1, num_trees, P4EST_MPI_GLOIDX,
+                           root, mpicomm);
+    SC_CHECK_MPI (mpiret);
   }
-  SC_CHECK_ABORT (gfq[num_procs] == pertree[num_trees], "pertree mismatch");
   P4EST_FREE (u64a);
+  file_offset += num_trees * sizeof (uint64_t);
 
   /* seek to the beginning of this processor's storage */
+  if (!broadcasthead || rank == root) {
+    P4EST_ASSERT (file_offset == src->bytes_out);
+    file_offset = 0;
+  }
   head_count = (size_t) (headc + save_num_procs) + (size_t) num_trees;
   zpadding = (align - (head_count * sizeof (uint64_t)) % align) % align;
   if (zpadding > 0 || rank > 0) {
-    retval =
-      sc_io_source_read (src, NULL, (long) (zpadding + gfq[rank] * comb_size),
-                         NULL);
+    retval = sc_io_source_read (src, NULL, (long)
+                                (file_offset + zpadding +
+                                 gfq[rank] * comb_size), NULL);
     SC_CHECK_ABORT (!retval, "seek data");
   }
 
diff --git a/src/p4est.h b/src/p4est.h
index 32cc000..530ba4b 100644
--- a/src/p4est.h
+++ b/src/p4est.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -64,12 +65,12 @@ SC_EXTERN_C_BEGIN;
 /** The 2D quadrant datatype */
 typedef struct p4est_quadrant
 {
-  /*@{*/
+  /*@{ */
   p4est_qcoord_t      x, y;  /**< coordinates */
-  /*@}*/
-  int8_t              level, /**< level of refinement */
-                      pad8;  /**< padding */
-  int16_t             pad16; /**< padding */
+  /*@} */
+  int8_t              level,    /**< level of refinement */
+                      pad8;     /**< padding */
+  int16_t             pad16;    /**< padding */
   union p4est_quadrant_data
   {
     void               *user_data;      /**< never changed by p4est */
@@ -131,11 +132,13 @@ typedef struct p4est
   sc_MPI_Comm         mpicomm;          /**< MPI communicator */
   int                 mpisize,          /**< number of MPI processes */
                       mpirank;          /**< this process's MPI rank */
+  int                 mpicomm_owned;    /**< flag if communicator is owned */
   size_t              data_size;        /**< size of per-quadrant p.user_data
                      (see p4est_quadrant_t::p4est_quadrant_data::user_data) */
   void               *user_pointer;     /**< convenience pointer for users,
                                              never touched by p4est */
 
+  long                revision;         /**< Gets bumped on mesh change */
   p4est_topidx_t      first_local_tree; /**< 0-based index of first local
                                              tree, must be -1 for an empty
                                              processor */
@@ -156,22 +159,33 @@ typedef struct p4est
   sc_array_t         *trees;          /**< array of all trees */
 
   sc_mempool_t       *user_data_pool; /**< memory allocator for user data */
-                                      /*   WARNING: This is NULL if data size
-                                                    equals zero. */
+  /*   WARNING: This is NULL if data size
+     equals zero. */
   sc_mempool_t       *quadrant_pool;  /**< memory allocator for temporary
                                            quadrants */
   p4est_inspect_t    *inspect;        /**< algorithmic switches */
 }
 p4est_t;
 
-/** Calculate memory usage of a forest structure.
+/** Calculate local memory usage of a forest structure.
+ * Not collective.  The memory used on the current rank is returned.
  * The connectivity structure is not counted since it is not owned;
  * use p4est_connectivity_memory_usage (p4est->connectivity).
- * \param [in] p4est    Forest structure.
+ * \param [in] p4est    Valid forest structure.
  * \return              Memory used in bytes.
  */
 size_t              p4est_memory_used (p4est_t * p4est);
 
+/** Return the revision counter of the forest.
+ * Not collective, even though the revision value is the same on all ranks.
+ * A newly created forest starts with a revision counter of zero.
+ * Every refine, coarsen, partition, and balance that actually changes the mesh
+ * increases the counter by one.  Operations with no effect keep the old value.
+ * \param [in] p4est    The forest must be valid.
+ * \return              Non-negative number.
+ */
+long                p4est_revision (p4est_t * p4est);
+
 /** Callback function prototype to initialize the quadrant's user data.
  * \param [in] p4est         the forest
  * \param [in] which_tree    the tree containing \a quadrant
@@ -225,7 +239,7 @@ extern void        *P4EST_DATA_UNINITIALIZED;
  * \param [in] connectivity     Connectivity must provide the vertices.
  * \param [in] treeid           Identify the tree that contains x, y.
  * \param [in] x, y             Quadrant coordinates relative to treeid.
- * \param [out] vxy             Transformed coordinates in vertex space.
+ * \param [out] vxyz            Transformed coordinates in vertex space.
  */
 void                p4est_qcoord_to_vertex (p4est_connectivity_t *
                                             connectivity,
@@ -269,10 +283,13 @@ void                p4est_destroy (p4est_t * p4est);
  * Copying of quadrant user data is optional.
  * If old and new data sizes are 0, the user_data field is copied regardless.
  * The inspect member of the copy is set to NULL.
+ * The revision counter of the copy is set to zero.
  *
  * \param [in]  copy_data  If true, data are copied.
  *                         If false, data_size is set to 0.
- * \return  Returns a valid p4est that does not depend on the input.
+ * \return  Returns a valid p4est that does not depend on the input,
+ *                         except for borrowing the same connectivity.
+ *                         Its revision counter is 0.
  */
 p4est_t            *p4est_copy (p4est_t * input, int copy_data);
 
@@ -371,6 +388,9 @@ unsigned            p4est_checksum (p4est_t * p4est);
  * header.  This makes the file depend on mpisize.  For changing this see
  * p4est_save_ext() in p4est_extended.h.
  *
+ * The revision counter is not saved to the file, since that would make files
+ * different that come from different revisions but store the same mesh.
+ *
  * \param [in] filename    Name of the file to write.
  * \param [in] p4est       Valid forest structure.
  * \param [in] save_data   If true, the element data is saved.
@@ -394,6 +414,8 @@ void                p4est_save (const char *filename, p4est_t * p4est,
  * that it was stored with.  The defaults can be changed with p4est_load_ext()
  * in p4est_extended.h.
  *
+ * The revision counter of the loaded p4est is set to zero.
+ *
  * \param [in] filename         Name of the file to read.
  * \param [in] mpicomm          A valid MPI communicator.
  * \param [in] data_size        Size of data for each quadrant which can be
diff --git a/src/p4est_algorithms.c b/src/p4est_algorithms.c
index 03ec870..e0ad381 100644
--- a/src/p4est_algorithms.c
+++ b/src/p4est_algorithms.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -438,21 +439,54 @@ p4est_is_valid (p4est_t * p4est)
   P4EST_QUADRANT_INIT (&nextlow);
   P4EST_QUADRANT_INIT (&s);
 
-#ifdef P4EST_ENABLE_DEBUG
+  /* we crash on NULL pointers */
+  P4EST_ASSERT (p4est != NULL && p4est->connectivity != NULL);
+
+  /* check parallel environment */
+  if (p4est_comm_parallel_env_is_null (p4est)) {
+    P4EST_NOTICE ("p4est invalid parallel environment");
+    failed = 1;
+    goto failtest;
+  }
+
+  /* make sure the revision counter is legitimate */
+  if (p4est->revision < 0) {
+    P4EST_NOTICE ("p4est invalid revision counter\n");
+    failed = 1;
+    goto failtest;
+  }
+
   /* check last item of global partition */
-  P4EST_ASSERT (p4est->global_first_position[num_procs].p.which_tree ==
-                p4est->connectivity->num_trees &&
-                p4est->global_first_position[num_procs].x == 0 &&
-                p4est->global_first_position[num_procs].y == 0);
+  if (!(p4est->global_first_position[num_procs].p.which_tree ==
+        p4est->connectivity->num_trees &&
+        p4est->global_first_position[num_procs].x == 0 &&
+        p4est->global_first_position[num_procs].y == 0
 #ifdef P4_TO_P8
-  P4EST_ASSERT (p4est->global_first_position[num_procs].z == 0);
+        && p4est->global_first_position[num_procs].z == 0
 #endif
-  P4EST_ASSERT (p4est->connectivity->num_trees ==
-                (p4est_topidx_t) p4est->trees->elem_count);
+      )) {
+    P4EST_NOTICE ("p4est invalid global first position");
+    failed = 1;
+    goto failtest;
+  }
+
+  /* tree count and quadrant first position level */
+  if (p4est->connectivity->num_trees !=
+      (p4est_topidx_t) p4est->trees->elem_count) {
+    P4EST_NOTICE ("p4est invalid tree count");
+    failed = 1;
+    goto failtest;
+  }
   for (i = 0; i <= num_procs; ++i) {
-    P4EST_ASSERT (p4est->global_first_position[i].level == P4EST_QMAXLEVEL);
+    if (p4est->global_first_position[i].level != P4EST_QMAXLEVEL) {
+      failed = 1;
+      break;
+    }
+  }
+  if (failed) {
+    P4EST_NOTICE ("p4est invalid global first position level");
+    goto failtest;
   }
-#endif /* P4EST_ENABLE_DEBUG */
 
   /* check first tree in global partition */
   if (first_tree < 0) {
@@ -599,23 +633,30 @@ p4est_is_valid (p4est_t * p4est)
     nquadrants = 0;
     for (i = 0; i <= P4EST_QMAXLEVEL; ++i) {
       perlevel = tree->quadrants_per_level[i];
-
-      P4EST_ASSERT (perlevel >= 0);
+      if (perlevel < 0) {
+        failed = 1;
+        break;
+      }
       nquadrants += perlevel;   /* same type */
       if (perlevel > 0) {
         maxlevel = i;
       }
     }
-    for (; i <= P4EST_MAXLEVEL; ++i) {
-      P4EST_ASSERT (tree->quadrants_per_level[i] == -1);
+    if (!failed) {
+      for (; i <= P4EST_MAXLEVEL; ++i) {
+        if (tree->quadrants_per_level[i] != -1) {
+          failed = 1;
+          break;
+        }
+      }
+      lquadrants += nquadrants; /* same type */
     }
-    lquadrants += nquadrants;   /* same type */
-
-    if (maxlevel != (int) tree->maxlevel) {
-      P4EST_NOTICE ("p4est invalid wrong maxlevel\n");
+    if (failed || maxlevel != (int) tree->maxlevel) {
+      P4EST_NOTICE ("p4est invalid tree level\n");
       failed = 1;
       goto failtest;
     }
+
     if (nquadrants != (p4est_locidx_t) tree->quadrants.elem_count) {
       P4EST_NOTICE ("p4est invalid tree quadrant count\n");
       failed = 1;
@@ -644,24 +685,24 @@ failtest:
 /* here come the heavyweight algorithms */
 #ifndef P4_TO_P8
 /* which face of the center quad touches this insul */
-const static int    insul_to_f[9] = { -1, 2, -1, 0, -1, 1, -1, 3, -1 };
+static const int    insul_to_f[9] = { -1, 2, -1, 0, -1, 1, -1, 3, -1 };
 
 /* which corner of the center quad touches this insul */
-const static int    insul_to_c[9] = { 0, -1, 1, -1, -1, -1, 2, -1, 3 };
+static const int    insul_to_c[9] = { 0, -1, 1, -1, -1, -1, 2, -1, 3 };
 #else
 /* which face of the center quad touches this insul */
 /* *INDENT-OFF* */
-const static int insul_to_f[27] =
+static const int insul_to_f[27] =
 {-1, -1, -1, -1, 4, -1, -1, -1, -1,
  -1, 2, -1, 0, -1, 1, -1, 3, -1,
  -1, -1, -1, -1, 5, -1, -1, -1, -1};
 /* which corner of the center quad touches this insul */
-const static int insul_to_c[27] =
+static const int insul_to_c[27] =
 {0, -1, 1, -1, -1, -1, 2, -1, 3,
  -1, -1, -1, -1, -1, -1, -1, -1, -1,
  4, -1, 5, -1, -1, -1, 6, -1, 7};
 /* which edge of the center quad touches this insul */
-const static int insul_to_e[27] =
+static const int insul_to_e[27] =
 {-1, 0, -1, 4, -1, 5, -1, 1, -1,
   8, -1, 9, -1, -1, -1, 10, -1, 11,
   -1, 2, -1, 6, -1, 7, -1, 3, -1};
@@ -1303,15 +1344,15 @@ p4est_complete_region (p4est_t * p4est,
     p4est_quadrant_children (&Afinest, c0, c1, c2, c3);
 #endif
 
-    sc_list_append (W, c0);
-    sc_list_append (W, c1);
-    sc_list_append (W, c2);
-    sc_list_append (W, c3);
+    (void) sc_list_append (W, c0);
+    (void) sc_list_append (W, c1);
+    (void) sc_list_append (W, c2);
+    (void) sc_list_append (W, c3);
 #ifdef P4_TO_P8
-    sc_list_append (W, c4);
-    sc_list_append (W, c5);
-    sc_list_append (W, c6);
-    sc_list_append (W, c7);
+    (void) sc_list_append (W, c4);
+    (void) sc_list_append (W, c5);
+    (void) sc_list_append (W, c6);
+    (void) sc_list_append (W, c7);
 #endif
 
     /* for each w in W */
@@ -1350,15 +1391,15 @@ p4est_complete_region (p4est_t * p4est,
 #endif
 
 #ifdef P4_TO_P8
-        sc_list_prepend (W, c7);
-        sc_list_prepend (W, c6);
-        sc_list_prepend (W, c5);
-        sc_list_prepend (W, c4);
-#endif
-        sc_list_prepend (W, c3);
-        sc_list_prepend (W, c2);
-        sc_list_prepend (W, c1);
-        sc_list_prepend (W, c0);
+        (void) sc_list_prepend (W, c7);
+        (void) sc_list_prepend (W, c6);
+        (void) sc_list_prepend (W, c5);
+        (void) sc_list_prepend (W, c4);
+#endif
+        (void) sc_list_prepend (W, c3);
+        (void) sc_list_prepend (W, c2);
+        (void) sc_list_prepend (W, c1);
+        (void) sc_list_prepend (W, c0);
       }
 
       /* W <- W - w */
@@ -1408,18 +1449,44 @@ p4est_quadrant_disjoint_parent (const void *a, const void *b)
   return 0;
 }
 
-/* kernel for balancing quadrants.
- * inlist: sorted linear array: every quadrant should be child_id == 0
- * dom: quadrant that is ancestor to all quadrants in \a in.
- * bound: balance type bound
- * qpool: quadrant mempool
- * list_alloc: sc_link_t mempool
- * out: output complete balance array
- * first_desc: optional first descendant
- * last_desct: optional last_descendant
- * count_in: count_already_inlist accumulator
- * count_out: count_already_outlist accumulator
- * count_an: count_ancestor_inlist_accumulator
+/** Complete/balance a region of a tree.
+ *
+ * \param [in] inlist             List of quadrants to consider: should be
+ *                                sorted and reduced, i.e., every quadrant
+ *                                should have child_id == 0.
+ * \param [in]     dom            Least common ancestor of all quadrants in
+ *                                \a inlist.
+ * \param [in]     bound          The number of quadrants in a neighborhood to
+ *                                consider when balancing.
+ *                                bound = 1 : just the quadrant itself, i.e.,
+ *                                            completion.
+ *                                bound = P4EST_DIM + 1 : face balance
+ *                                bound = 2**P4EST_DIM  : full balance
+ *                                bound = 2**P4EST_DIM - 1 : edge balance
+ * \param [in/out] qpool          quadrant pool for temporary quadrants
+ * \param [in/out] list_alloc     list mempool for hash tables
+ * \param [in/out] out            the sorted, complete, balanced quadrants in
+ *                                the region will be appended to out
+ * \param [in]     first_desc     the first quadrant defining the start of the
+ *                                region.  if NULL, the region is understood
+ *                                to start with the first descendant of \a
+ *                                dom.
+ * \param [in]     last_desc      the last quadrant defining the end of the
+ *                                region.  if NULL, the region is understood
+ *                                to end with the last descendant of \a
+ *                                dom.
+ * \param [in/out] count_in       If not NULL, points to an accumulator for
+ *                                the number of times the balance algorithm
+ *                                tries to insert a quadrant that already
+ *                                exists
+ * \param [in/out] count_out      If not NULL, points to an accumulator for
+ *                                the number of times the balance algorithm
+ *                                tries to duplicate the insertion of a new
+ *                                quadrant
+ * \param [in/out] count_an       If not NULL, points to an accumulator for
+ *                                the number of times the balance algorithm
+ *                                tries to insert the ancestor of an existing
+ *                                quadrant
  */
 static void
 p4est_complete_or_balance_kernel (sc_array_t * inlist,
@@ -1496,7 +1563,7 @@ p4est_complete_or_balance_kernel (sc_array_t * inlist,
         /* add tempq to inlist */
         sc_array_resize (inlist, inlist->elem_count + 1);
         memmove (sc_array_index (inlist, si + 1), sc_array_index (inlist, si),
-                 incount - si);
+                 (incount - si) * inlist->elem_size);
         q = p4est_quadrant_array_index (inlist, si);
         *q = tempq;
         q->p.user_int = 0;
@@ -1532,7 +1599,8 @@ p4est_complete_or_balance_kernel (sc_array_t * inlist,
         sc_array_resize (inlist, inlist->elem_count + 1);
         if ((size_t) si < incount - 1) {
           memmove (sc_array_index (inlist, si + 2),
-                   sc_array_index (inlist, si + 1), incount - (si + 1));
+                   sc_array_index (inlist, si + 1),
+                   (incount - (si + 1)) * inlist->elem_size);
         }
         q = p4est_quadrant_array_index (inlist, si + 1);
         *q = tempp;
@@ -1544,7 +1612,7 @@ p4est_complete_or_balance_kernel (sc_array_t * inlist,
       /* add tempp to inlist */
       sc_array_resize (inlist, inlist->elem_count + 1);
       memmove (sc_array_index (inlist, 1), sc_array_index (inlist, 0),
-               incount);
+               incount * inlist->elem_size);
       q = p4est_quadrant_array_index (inlist, 0);
       *q = tempp;
       q->p.user_int = 0;
@@ -1554,237 +1622,240 @@ p4est_complete_or_balance_kernel (sc_array_t * inlist,
 
   P4EST_ASSERT (sc_array_is_sorted (inlist, p4est_quadrant_compare));
 
-  /* initialize temporary storage */
-  for (l = 0; l <= minlevel; ++l) {
-    /* we don't need a hash table for minlevel, because all minlevel
-     * quadrants will be created automatically by filling in gaps */
-    hash[l] = NULL;
-    memset (&outlist[l], -1, sizeof (sc_array_t));
-  }
-  for (; l < maxlevel; ++l) {
-    hash[l] = sc_hash_new (p4est_quadrant_hash_fn, p4est_quadrant_equal_fn,
-                           NULL, list_alloc);
-    sc_array_init (&outlist[l], sizeof (p4est_quadrant_t *));
-  }
-  for (; l <= P4EST_MAXLEVEL; ++l) {
-    /* we don't need a hash table for maxlevel because a quad only spawns
-     * larger quads */
-    hash[l] = NULL;
-    memset (&outlist[l], -1, sizeof (sc_array_t));
-  }
-  outlist[maxlevel].elem_count = 0;
-
-  /* walk through the input tree bottom-up */
-  ph = 0;
-  pid = -1;
-  qalloc = p4est_quadrant_mempool_alloc (qpool);
-  qalloc->p.user_int = 0;
-
-  /* we don't need to run for minlevel + 1, because all of the quads that
-   * would be created would be outside dom */
-  for (l = maxlevel; l > minlevel + 1; l--) {
-    ocount = outlist[l].elem_count;     /* ocount is not growing */
-    olist = &outlist[l - 1];
-    for (jz = 0; jz < incount + ocount; ++jz) {
-      if (jz < incount) {
-        q = p4est_quadrant_array_index (inlist, jz);
-        if ((int) q->level != l || (q->p.user_int & duplicate)) {
-          /* if a duplicate, don't run */
-          continue;
+  if (bound > 1) {
+    /* initialize temporary storage */
+    for (l = 0; l <= minlevel; ++l) {
+      /* we don't need a hash table for minlevel, because all minlevel
+       * quadrants will be created automatically by filling in gaps */
+      hash[l] = NULL;
+      memset (&outlist[l], -1, sizeof (sc_array_t));
+    }
+    for (; l < maxlevel; ++l) {
+      hash[l] = sc_hash_new (p4est_quadrant_hash_fn, p4est_quadrant_equal_fn,
+                             NULL, list_alloc);
+      sc_array_init (&outlist[l], sizeof (p4est_quadrant_t *));
+    }
+    for (; l <= P4EST_MAXLEVEL; ++l) {
+      /* we don't need a hash table for maxlevel because a quad only spawns
+       * larger quads */
+      hash[l] = NULL;
+      memset (&outlist[l], -1, sizeof (sc_array_t));
+    }
+    outlist[maxlevel].elem_count = 0;
+
+    /* walk through the input tree bottom-up */
+    ph = 0;
+    pid = -1;
+    qalloc = p4est_quadrant_mempool_alloc (qpool);
+    qalloc->p.user_int = 0;
+
+    /* we don't need to run for minlevel + 1, because all of the quads that
+     * would be created would be outside dom */
+    for (l = maxlevel; l > minlevel + 1; l--) {
+      ocount = outlist[l].elem_count;   /* ocount is not growing */
+      olist = &outlist[l - 1];
+      for (jz = 0; jz < incount + ocount; ++jz) {
+        if (jz < incount) {
+          q = p4est_quadrant_array_index (inlist, jz);
+          if ((int) q->level != l || (q->p.user_int & duplicate)) {
+            /* if a duplicate, don't run */
+            continue;
+          }
         }
-      }
-      else {
-        qpointer =
-          (p4est_quadrant_t **) sc_array_index (&outlist[l], jz - incount);
-        q = *qpointer;
-        P4EST_ASSERT ((int) q->level == l);
-      }
-      P4EST_ASSERT (p4est_quadrant_is_ancestor (dom, q));
-      P4EST_ASSERT (p4est_quadrant_child_id (q) == 0);
+        else {
+          qpointer =
+            (p4est_quadrant_t **) sc_array_index (&outlist[l], jz - incount);
+          q = *qpointer;
+          P4EST_ASSERT ((int) q->level == l);
+        }
+        P4EST_ASSERT (p4est_quadrant_is_ancestor (dom, q));
+        P4EST_ASSERT (p4est_quadrant_child_id (q) == 0);
 
-      p4est_quadrant_parent (q, &par);  /* get the parent */
-      ph = P4EST_QUADRANT_LEN (par.level - 1);  /* twice its size */
-      pid = p4est_quadrant_child_id (&par);     /* and position */
-      p4est_quadrant_sibling (&par, &par, 0);   /* now shift to 0 */
+        p4est_quadrant_parent (q, &par);        /* get the parent */
+        ph = P4EST_QUADRANT_LEN (par.level - 1);        /* twice its size */
+        pid = p4est_quadrant_child_id (&par);   /* and position */
+        p4est_quadrant_sibling (&par, &par, 0); /* now shift to 0 */
 
-      for (sid = 0; sid < bound; sid++) {
-        *qalloc = par;
-        if (!sid) {
-          qalloc->p.user_int = precluded;
-          P4EST_ASSERT (p4est_quadrant_is_ancestor (dom, qalloc));
-        }
-        else if (sid <= P4EST_DIM) {
-          /* include face neighbors */
-          switch (sid - 1) {
-          case 0:
-            qalloc->x += ((pid & 1) ? ph : -ph);
-            break;
-          case 1:
-            qalloc->y += ((pid & 2) ? ph : -ph);
-            break;
+        for (sid = 0; sid < bound; sid++) {
+          *qalloc = par;
+          if (!sid) {
+            qalloc->p.user_int = precluded;
+            P4EST_ASSERT (p4est_quadrant_is_ancestor (dom, qalloc));
+          }
+          else if (sid <= P4EST_DIM) {
+            /* include face neighbors */
+            switch (sid - 1) {
+            case 0:
+              qalloc->x += ((pid & 1) ? ph : -ph);
+              break;
+            case 1:
+              qalloc->y += ((pid & 2) ? ph : -ph);
+              break;
 #ifdef P4_TO_P8
-          case 2:
-            qalloc->z += ((pid & 4) ? ph : -ph);
-            break;
+            case 2:
+              qalloc->z += ((pid & 4) ? ph : -ph);
+              break;
 #endif
-          default:
-            SC_ABORT_NOT_REACHED ();
+            default:
+              SC_ABORT_NOT_REACHED ();
+            }
           }
-        }
 #ifdef P4_TO_P8
-        else if (sid < 7) {
-          /* include edge neighbors */
-          switch (sid - 4) {
-          case 0:
-            qalloc->y += ((pid & 2) ? ph : -ph);
-            qalloc->z += ((pid & 4) ? ph : -ph);
-            break;
-          case 1:
-            qalloc->x += ((pid & 1) ? ph : -ph);
-            qalloc->z += ((pid & 4) ? ph : -ph);
-            break;
-          case 2:
-            qalloc->x += ((pid & 1) ? ph : -ph);
-            qalloc->y += ((pid & 2) ? ph : -ph);
-            break;
-          default:
-            SC_ABORT_NOT_REACHED ();
+          else if (sid < 7) {
+            /* include edge neighbors */
+            switch (sid - 4) {
+            case 0:
+              qalloc->y += ((pid & 2) ? ph : -ph);
+              qalloc->z += ((pid & 4) ? ph : -ph);
+              break;
+            case 1:
+              qalloc->x += ((pid & 1) ? ph : -ph);
+              qalloc->z += ((pid & 4) ? ph : -ph);
+              break;
+            case 2:
+              qalloc->x += ((pid & 1) ? ph : -ph);
+              qalloc->y += ((pid & 2) ? ph : -ph);
+              break;
+            default:
+              SC_ABORT_NOT_REACHED ();
+            }
           }
-        }
 #endif
-        else {
-          /* include corner neighbor */
-          qalloc->x += ((pid & 1) ? ph : -ph);
-          qalloc->y += ((pid & 2) ? ph : -ph);
+          else {
+            /* include corner neighbor */
+            qalloc->x += ((pid & 1) ? ph : -ph);
+            qalloc->y += ((pid & 2) ? ph : -ph);
 #ifdef P4_TO_P8
-          qalloc->z += ((pid & 4) ? ph : -ph);
+            qalloc->z += ((pid & 4) ? ph : -ph);
 #endif
-        }
+          }
 
-        P4EST_ASSERT (p4est_quadrant_is_extended (qalloc));
-        P4EST_ASSERT (p4est_quadrant_child_id (qalloc) == 0);
-        P4EST_ASSERT (!sid || qalloc->p.user_int == 0);
-        P4EST_ASSERT (qalloc->level == l - 1);
+          P4EST_ASSERT (p4est_quadrant_is_extended (qalloc));
+          P4EST_ASSERT (p4est_quadrant_child_id (qalloc) == 0);
+          P4EST_ASSERT (!sid || qalloc->p.user_int == 0);
+          P4EST_ASSERT (qalloc->level == l - 1);
 
-        /* do not add quadrants outside of the domain */
-        if (sid && !p4est_quadrant_is_ancestor (dom, qalloc)) {
-          continue;
-        }
+          /* do not add quadrants outside of the domain */
+          if (sid && !p4est_quadrant_is_ancestor (dom, qalloc)) {
+            continue;
+          }
 
-        /* make sure that qalloc is not included more than once */
-        inserted = sc_hash_insert_unique (hash[l - 1], qalloc, &vlookup);
-        if (!inserted) {
-          /* qalloc is already included in output list, this catches most */
-          ++count_already_outlist;
-          if (!sid) {
-            /* we need to relay the fact that this octant is precluded */
-            qlookup = (p4est_quadrant_t *) * vlookup;
-            qlookup->p.user_int = precluded;
+          /* make sure that qalloc is not included more than once */
+          inserted = sc_hash_insert_unique (hash[l - 1], qalloc, &vlookup);
+          if (!inserted) {
+            /* qalloc is already included in output list, this catches most */
+            ++count_already_outlist;
+            if (!sid) {
+              /* we need to relay the fact that this octant is precluded */
+              qlookup = (p4est_quadrant_t *) * vlookup;
+              qlookup->p.user_int = precluded;
+            }
+            continue;
           }
-          continue;
-        }
 
-        if (sid) {
-          /* we do not need to search if we are adding the parent sibling: we
-           * already know that the octant is precluded, and any other octant
-           * we might find should already be marked duplicate */
-          srindex = sc_array_bsearch (inlist, qalloc,
-                                      p4est_quadrant_disjoint_parent);
-
-          if (srindex != -1) {
-            r = p4est_quadrant_array_index (inlist, srindex);
-
-            if (r->level >= l - 1) {
-              /* either qalloc duplicates r or is precluded by r: either way,
-               * we do not need to add qalloc to inlist in the final merge */
-              qalloc->p.user_int = precluded;
-              if (r->level > l - 1) {
-                ++count_ancestor_inlist;
-              }
-              else {
-                ++count_already_inlist;
+          if (sid) {
+            /* we do not need to search if we are adding the parent sibling: we
+             * already know that the octant is precluded, and any other octant
+             * we might find should already be marked duplicate */
+            srindex = sc_array_bsearch (inlist, qalloc,
+                                        p4est_quadrant_disjoint_parent);
+
+            if (srindex != -1) {
+              r = p4est_quadrant_array_index (inlist, srindex);
+
+              if (r->level >= l - 1) {
+                /* either qalloc duplicates r or is precluded by r: either way,
+                 * we do not need to add qalloc to inlist in the final merge */
+                qalloc->p.user_int = precluded;
+                if (r->level > l - 1) {
+                  ++count_ancestor_inlist;
+                }
+                else {
+                  ++count_already_inlist;
+                }
               }
-            }
-            if (r->level <= l - 1) {
-              /* either qalloc duplicates r, or an octant that can be traced to
-               * qalloc will duplicate r */
-              r->p.user_int |= duplicate;
-              if (r->level < l - 1) {
-                /* if qalloc precluded r, we can remove r before the final
-                 * merge */
-                r->p.user_int |= precluded;
+              if (r->level <= l - 1) {
+                /* either qalloc duplicates r, or an octant that can be traced to
+                 * qalloc will duplicate r */
+                r->p.user_int |= duplicate;
+                if (r->level < l - 1) {
+                  /* if qalloc precluded r, we can remove r before the final
+                   * merge */
+                  r->p.user_int |= precluded;
+                }
               }
             }
           }
-        }
 
-        qpointer = (p4est_quadrant_t **) sc_array_push (olist);
-        *qpointer = qalloc;
-        /* we need a new quadrant now, the old one is stored away */
-        qalloc = p4est_quadrant_mempool_alloc (qpool);
-        qalloc->p.user_int = 0;
+          qpointer = (p4est_quadrant_t **) sc_array_push (olist);
+          *qpointer = qalloc;
+          /* we need a new quadrant now, the old one is stored away */
+          qalloc = p4est_quadrant_mempool_alloc (qpool);
+          qalloc->p.user_int = 0;
+        }
       }
     }
-  }
-  sc_mempool_free (qpool, qalloc);
+    sc_mempool_free (qpool, qalloc);
 
-  /* remove unneeded octants */
-  jz = 0;
-  for (iz = 0; iz < incount; iz++) {
-    q = p4est_quadrant_array_index (inlist, iz);
-    if ((q->p.user_int & precluded) == 0) {
-      if (jz != iz) {
-        p = p4est_quadrant_array_index (inlist, jz++);
-        *p = *q;
-      }
-      else {
-        jz++;
+    /* remove unneeded octants */
+    jz = 0;
+    for (iz = 0; iz < incount; iz++) {
+      q = p4est_quadrant_array_index (inlist, iz);
+      if ((q->p.user_int & precluded) == 0) {
+        if (jz != iz) {
+          p = p4est_quadrant_array_index (inlist, jz++);
+          *p = *q;
+        }
+        else {
+          jz++;
+        }
       }
     }
-  }
-  sc_array_resize (inlist, jz);
-  incount = jz;
+    sc_array_resize (inlist, jz);
+    incount = jz;
 
-  for (l = minlevel + 1; l < maxlevel; ++l) {
-    /* print statistics and free hash tables */
+    for (l = minlevel + 1; l < maxlevel; ++l) {
+      /* print statistics and free hash tables */
 #ifdef P4EST_ENABLE_DEBUG
-    sc_hash_print_statistics (p4est_package_id, SC_LP_DEBUG, hash[l]);
+      sc_hash_print_statistics (p4est_package_id, SC_LP_DEBUG, hash[l]);
 #endif
-    sc_hash_unlink_destroy (hash[l]);
+      sc_hash_unlink_destroy (hash[l]);
 
-    /* merge valid quadrants from outlist into inlist */
-    ocount = outlist[l].elem_count;
-    q = NULL;
-    for (jz = 0; jz < ocount; ++jz) {
-      /* go through output list */
-      qpointer = (p4est_quadrant_t **) sc_array_index (&outlist[l], jz);
-      qalloc = *qpointer;
-      P4EST_ASSERT ((int) qalloc->level == l);
-      P4EST_ASSERT (p4est_quadrant_is_ancestor (dom, qalloc));
-      P4EST_ASSERT (p4est_quadrant_child_id (qalloc) == 0);
-      /* copy temporary quadrant into inlist */
-      if (first_desc != NULL && p4est_quadrant_compare (qalloc, &fd) < 0) {
-        sc_mempool_free (qpool, qalloc);
-        continue;
-      }
-      if (last_desc != NULL && p4est_quadrant_compare (qalloc, last_desc) > 0) {
+      /* merge valid quadrants from outlist into inlist */
+      ocount = outlist[l].elem_count;
+      q = NULL;
+      for (jz = 0; jz < ocount; ++jz) {
+        /* go through output list */
+        qpointer = (p4est_quadrant_t **) sc_array_index (&outlist[l], jz);
+        qalloc = *qpointer;
+        P4EST_ASSERT ((int) qalloc->level == l);
+        P4EST_ASSERT (p4est_quadrant_is_ancestor (dom, qalloc));
+        P4EST_ASSERT (p4est_quadrant_child_id (qalloc) == 0);
+        /* copy temporary quadrant into inlist */
+        if (first_desc != NULL && p4est_quadrant_compare (qalloc, &fd) < 0) {
+          sc_mempool_free (qpool, qalloc);
+          continue;
+        }
+        if (last_desc != NULL
+            && p4est_quadrant_compare (qalloc, last_desc) > 0) {
+          sc_mempool_free (qpool, qalloc);
+          continue;
+        }
+        if (qalloc->p.user_int != precluded) {
+          q = p4est_quadrant_array_push (inlist);
+          *q = *qalloc;
+        }
         sc_mempool_free (qpool, qalloc);
-        continue;
       }
-      if (qalloc->p.user_int != precluded) {
-        q = p4est_quadrant_array_push (inlist);
-        *q = *qalloc;
-      }
-      sc_mempool_free (qpool, qalloc);
+      sc_array_reset (&outlist[l]);
     }
-    sc_array_reset (&outlist[l]);
-  }
-  P4EST_ASSERT (quadrant_pool_size == qpool->elem_count);
-  sc_mempool_truncate (list_alloc);
+    P4EST_ASSERT (quadrant_pool_size == qpool->elem_count);
+    sc_mempool_truncate (list_alloc);
 
-  /* sort inlist */
-  if (inlist->elem_count > incount) {
-    sc_array_sort (inlist, p4est_quadrant_compare);
+    /* sort inlist */
+    if (inlist->elem_count > incount) {
+      sc_array_sort (inlist, p4est_quadrant_compare);
+    }
   }
 
   /* step through inlist and fill in the gaps in out */
@@ -2001,6 +2072,7 @@ p4est_complete_or_balance (p4est_t * p4est, p4est_topidx_t which_tree,
   p4est_tree_t       *tree;
   sc_array_t         *tquadrants;
   int                 bound;
+  int8_t              maxlevel;
   sc_mempool_t       *qpool;
 #ifdef P4EST_ENABLE_DEBUG
   size_t              data_pool_size;
@@ -2088,7 +2160,7 @@ p4est_complete_or_balance (p4est_t * p4est, p4est_topidx_t which_tree,
     p4est_nearest_common_ancestor (p, q, &tempq);
     if (tempq.level >= SC_MIN (q->level, p->level) - 1) {
       if (p->level > q->level) {
-        *q = *p;
+        p4est_quadrant_sibling (p, q, 0);
       }
       continue;
     }
@@ -2109,6 +2181,7 @@ p4est_complete_or_balance (p4est_t * p4est, p4est_topidx_t which_tree,
 
   iz = 0;                       /* tquadrants */
   jz = 0;                       /* outlist */
+  maxlevel = tree->maxlevel;
 
   /* initialize quadrants in outlist */
   while (iz < tcount && jz < ocount) {
@@ -2118,6 +2191,7 @@ p4est_complete_or_balance (p4est_t * p4est, p4est_topidx_t which_tree,
     /* watch out for gaps in tquadrants */
     while (p4est_quadrant_compare (p, q) < 0) {
       P4EST_ASSERT (!p4est_quadrant_is_ancestor (p, q));
+      maxlevel = SC_MAX (maxlevel, p->level);
       ++tree->quadrants_per_level[p->level];
       p4est_quadrant_init_data (p4est, which_tree, p, init_fn);
       jz++;
@@ -2138,6 +2212,7 @@ p4est_complete_or_balance (p4est_t * p4est, p4est_topidx_t which_tree,
         jzstart = jz;
       }
       while (jz < ocount && p4est_quadrant_is_ancestor (q, p)) {
+        maxlevel = SC_MAX (maxlevel, p->level);
         ++tree->quadrants_per_level[p->level];
         p4est_quadrant_init_data (p4est, which_tree, p, init_fn);
         if (++jz < ocount) {
@@ -2165,6 +2240,7 @@ p4est_complete_or_balance (p4est_t * p4est, p4est_topidx_t which_tree,
   /* initialize new quadrants after last tquadrant */
   for (; jz < ocount; jz++) {
     p = p4est_quadrant_array_index (outlist, jz);
+    maxlevel = SC_MAX (maxlevel, p->level);
     ++tree->quadrants_per_level[p->level];
     p4est_quadrant_init_data (p4est, which_tree, p, init_fn);
   }
@@ -2172,6 +2248,7 @@ p4est_complete_or_balance (p4est_t * p4est, p4est_topidx_t which_tree,
   /* resize tquadrants and copy */
   sc_array_resize (tquadrants, ocount);
   memcpy (tquadrants->array, outlist->array, outlist->elem_size * ocount);
+  tree->maxlevel = maxlevel;
 
   /* sanity check */
   if (p4est->user_data_pool != NULL) {
diff --git a/src/p4est_algorithms.h b/src/p4est_algorithms.h
index aed1401..0ac750a 100644
--- a/src/p4est_algorithms.h
+++ b/src/p4est_algorithms.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -111,7 +112,10 @@ int                 p4est_is_equal (p4est_t * p4est1, p4est_t * p4est2,
  *    the quadrant counters are consistent
  *    all trees are complete
  *    all non-local trees are empty
+ * This function is collective!
+ * It is also relatively expensive, so its use in production should be limited.
  * \param [in] p4est    The forest to be tested.
+ *                      Both the forest and its connectivity must be non-NULL.
  * \return              Returns true if valid, false otherwise.
  */
 int                 p4est_is_valid (p4est_t * p4est);
@@ -185,6 +189,9 @@ void                p4est_complete_region (p4est_t * p4est,
 
 /** Completes a sorted tree within a p4est. It may have exterior quadrants.
  * The completed tree will have only owned quadrants and no overlap.
+ * Note that the tree's counters (\a quadrants_per_level, \a maxlevel) must be
+ * correct for the quadrants in the incoming tree.
+ *
  * \param [in,out] p4est      The p4est to work on.
  * \param [in]     which_tree The 0-based index of the subtree to complete.
  * \param [in]     init_fn    Callback function to initialize the user_data
diff --git a/src/p4est_balance.c b/src/p4est_balance.c
index 9ecd50c..99bc7c8 100644
--- a/src/p4est_balance.c
+++ b/src/p4est_balance.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2011 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p4est_balance.h b/src/p4est_balance.h
index ce92035..1160033 100644
--- a/src/p4est_balance.h
+++ b/src/p4est_balance.h
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2011 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p4est_base.c b/src/p4est_base.c
index e414f9c..87089d6 100644
--- a/src/p4est_base.c
+++ b/src/p4est_base.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p4est_base.h b/src/p4est_base.h
index 636a7f3..37c6e5b 100644
--- a/src/p4est_base.h
+++ b/src/p4est_base.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -58,6 +59,7 @@ typedef int32_t     p4est_qcoord_t;
 #define P4EST_QCOORD_MIN INT32_MIN
 #define P4EST_QCOORD_MAX INT32_MAX
 #define P4EST_QCOORD_1   ((p4est_qcoord_t) 1)
+#define P4EST_QCOORD_ABS(x) ((p4est_qcoord_t) labs ((long) (x)))
 
 /** Typedef for counting topological entities (trees, tree vertices). */
 typedef int32_t     p4est_topidx_t;
@@ -69,6 +71,7 @@ typedef int32_t     p4est_topidx_t;
 #define P4EST_TOPIDX_MAX INT32_MAX
 #define P4EST_TOPIDX_FITS_32 1
 #define P4EST_TOPIDX_1   ((p4est_topidx_t) 1)
+#define P4EST_TOPIDX_ABS(x) ((p4est_topidx_t) labs ((long) (x)))
 
 /** Typedef for processor-local indexing of quadrants and nodes. */
 typedef int32_t     p4est_locidx_t;
@@ -79,6 +82,7 @@ typedef int32_t     p4est_locidx_t;
 #define P4EST_LOCIDX_MIN INT32_MIN
 #define P4EST_LOCIDX_MAX INT32_MAX
 #define P4EST_LOCIDX_1   ((p4est_locidx_t) 1)
+#define P4EST_LOCIDX_ABS(x) ((p4est_locidx_t) labs ((long) (x)))
 
 /** Typedef for globally unique indexing of quadrants. */
 typedef int64_t     p4est_gloidx_t;
@@ -89,6 +93,39 @@ typedef int64_t     p4est_gloidx_t;
 #define P4EST_GLOIDX_MIN INT64_MIN
 #define P4EST_GLOIDX_MAX INT64_MAX
 #define P4EST_GLOIDX_1   ((p4est_gloidx_t) 1)
+#define P4EST_GLOIDX_ABS(x) ((p4est_gloidx_t) llabs ((long long) (x)))
+
+/** Tags for MPI messages */
+typedef enum p4est_comm_tag
+{
+  P4EST_COMM_TAG_FIRST = SC_TAG_FIRST,
+  P4EST_COMM_COUNT_PERTREE = SC_TAG_LAST,
+  P4EST_COMM_BALANCE_FIRST_COUNT,
+  P4EST_COMM_BALANCE_FIRST_LOAD,
+  P4EST_COMM_BALANCE_SECOND_COUNT,
+  P4EST_COMM_BALANCE_SECOND_LOAD,
+  P4EST_COMM_PARTITION_GIVEN,
+  P4EST_COMM_PARTITION_WEIGHTED_LOW,
+  P4EST_COMM_PARTITION_WEIGHTED_HIGH,
+  P4EST_COMM_PARTITION_CORRECTION,
+  P4EST_COMM_GHOST_COUNT,
+  P4EST_COMM_GHOST_LOAD,
+  P4EST_COMM_GHOST_EXCHANGE,
+  P4EST_COMM_GHOST_EXPAND_COUNT,
+  P4EST_COMM_GHOST_EXPAND_LOAD,
+  P4EST_COMM_GHOST_SUPPORT_COUNT,
+  P4EST_COMM_GHOST_SUPPORT_LOAD,
+  P4EST_COMM_GHOST_CHECKSUM,
+  P4EST_COMM_NODES_QUERY,
+  P4EST_COMM_NODES_REPLY,
+  P4EST_COMM_SAVE,
+  P4EST_COMM_LNODES_TEST,
+  P4EST_COMM_LNODES_PASS,
+  P4EST_COMM_LNODES_OWNED,
+  P4EST_COMM_LNODES_ALL,
+  P4EST_COMM_TAG_LAST
+}
+p4est_comm_tag_t;
 
 /* some error checking possibly specific to p4est */
 #ifdef P4EST_ENABLE_DEBUG
@@ -101,12 +138,15 @@ typedef int64_t     p4est_gloidx_t;
   do { int _p4est_i = (int) (expression);                               \
        SC_CHECK_ABORT (_p4est_i, "Expected true: '" #expression "'");   \
   } while (0)
+#define P4EST_DEBUG_EXECUTE(expression)                 \
+  do { (void) (expression); } while (0)
 #else
 #define P4EST_ASSERT(c) SC_NOOP ()
-#define P4EST_EXECUTE_ASSERT_FALSE(expression) \
+#define P4EST_EXECUTE_ASSERT_FALSE(expression)          \
   do { (void) (expression); } while (0)
-#define P4EST_EXECUTE_ASSERT_TRUE(expression) \
+#define P4EST_EXECUTE_ASSERT_TRUE(expression)           \
   do { (void) (expression); } while (0)
+#define P4EST_DEBUG_EXECUTE(expression) SC_NOOP ()
 #endif
 
 /* macros for memory allocation, will abort if out of memory */
diff --git a/src/p4est_bits.c b/src/p4est_bits.c
index da398e8..8ab7916 100644
--- a/src/p4est_bits.c
+++ b/src/p4est_bits.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -760,7 +761,11 @@ p4est_quadrant_is_inside_tree (p4est_tree_t * tree,
     return 0;
 
   /* check if the end of q is not after the last tree quadrant */
+  /* tree->last_desc is an upper right corner quadrant by construction.
+   * It is ok to compare with q. */
+#if 0
   p4est_quadrant_last_descendant (q, &desc, P4EST_QMAXLEVEL);
+#endif
   if (p4est_quadrant_compare (&tree->last_desc, q) < 0)
     return 0;
 
@@ -823,6 +828,25 @@ p4est_quadrant_sibling (const p4est_quadrant_t * q, p4est_quadrant_t * r,
 }
 
 void
+p4est_quadrant_child (const p4est_quadrant_t * q, p4est_quadrant_t * r,
+                      int child_id)
+{
+  const p4est_qcoord_t shift = P4EST_QUADRANT_LEN (q->level + 1);
+
+  P4EST_ASSERT (p4est_quadrant_is_extended (q));
+  P4EST_ASSERT (q->level < P4EST_QMAXLEVEL);
+  P4EST_ASSERT (child_id >= 0 && child_id < P4EST_CHILDREN);
+
+  r->x = child_id & 0x01 ? (q->x | shift) : q->x;
+  r->y = child_id & 0x02 ? (q->y | shift) : q->y;
+#ifdef P4_TO_P8
+  r->z = child_id & 0x04 ? (q->z | shift) : q->z;
+#endif
+  r->level = q->level + 1;
+  P4EST_ASSERT (p4est_quadrant_is_parent (q, r));
+}
+
+void
 p4est_quadrant_face_neighbor (const p4est_quadrant_t * q,
                               int face, p4est_quadrant_t * r)
 {
diff --git a/src/p4est_bits.h b/src/p4est_bits.h
index f84ce7a..ea1923a 100644
--- a/src/p4est_bits.h
+++ b/src/p4est_bits.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -331,6 +332,15 @@ void                p4est_quadrant_sibling (const p4est_quadrant_t * q,
                                             p4est_quadrant_t * r,
                                             int sibling_id);
 
+/** Compute a specific child of a quadrant.
+ * \param [in]     q    Input quadrant.
+ * \param [in,out] r    Existing quadrant whose Morton index will be filled
+ *                      with the coordinates of child no. \b child_id of \a q.
+ * \param [in] child_id The id of the child computed, 0..3.
+ */
+void                p4est_quadrant_child (const p4est_quadrant_t * q,
+                                          p4est_quadrant_t * r, int child_id);
+
 /** Compute the face neighbor of a quadrant.
  * \param [in]     q      Input quadrant, must be valid.
  * \param [in]     face   The face across which to generate the neighbor.
diff --git a/src/p4est_communication.c b/src/p4est_communication.c
index e184d18..32f2714 100644
--- a/src/p4est_communication.c
+++ b/src/p4est_communication.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -22,9 +23,11 @@
 */
 
 #ifdef P4_TO_P8
+#include <p8est_algorithms.h>
 #include <p8est_communication.h>
 #include <p8est_bits.h>
 #else
+#include <p4est_algorithms.h>
 #include <p4est_communication.h>
 #include <p4est_bits.h>
 #endif /* !P4_TO_P8 */
@@ -33,6 +36,272 @@
 #endif
 
 void
+p4est_comm_parallel_env_assign (p4est_t * p4est, sc_MPI_Comm mpicomm)
+{
+  /* set MPI communicator */
+  p4est->mpicomm = mpicomm;
+  p4est->mpicomm_owned = 0;
+
+  /* retrieve MPI information */
+  p4est_comm_parallel_env_get_info (p4est);
+}
+
+void
+p4est_comm_parallel_env_duplicate (p4est_t * p4est)
+{
+  sc_MPI_Comm         mpicomm = p4est->mpicomm;
+  int                 mpiret;
+
+  /* duplicate MPI communicator */
+  mpiret = sc_MPI_Comm_dup (mpicomm, &(p4est->mpicomm));
+  SC_CHECK_MPI (mpiret);
+  p4est->mpicomm_owned = 1;
+}
+
+void
+p4est_comm_parallel_env_release (p4est_t * p4est)
+{
+  int                 mpiret;
+
+  /* free MPI communicator if it's owned */
+  if (p4est->mpicomm_owned) {
+    mpiret = sc_MPI_Comm_free (&(p4est->mpicomm));
+    SC_CHECK_MPI (mpiret);
+  }
+  p4est->mpicomm = sc_MPI_COMM_NULL;
+  p4est->mpicomm_owned = 0;
+
+  /* set MPI information */
+  p4est->mpisize = 0;
+  p4est->mpirank = sc_MPI_UNDEFINED;
+}
+
+void
+p4est_comm_parallel_env_replace (p4est_t * p4est, sc_MPI_Comm mpicomm)
+{
+  /* check if input MPI communicator has same size and same rank order */
+#ifdef P4EST_ENABLE_DEBUG
+  {
+    int                 mpiret, result;
+
+    mpiret = sc_MPI_Comm_compare (p4est->mpicomm, mpicomm, &result);
+    SC_CHECK_MPI (mpiret);
+
+    P4EST_ASSERT (result == sc_MPI_IDENT || result == sc_MPI_CONGRUENT);
+  }
+#endif
+
+  /* release the current parallel environment */
+  p4est_comm_parallel_env_release (p4est);
+
+  /* assign new MPI communicator */
+  p4est_comm_parallel_env_assign (p4est, mpicomm);
+}
+
+void
+p4est_comm_parallel_env_get_info (p4est_t * p4est)
+{
+  int                 mpiret;
+
+  mpiret = sc_MPI_Comm_size (p4est->mpicomm, &(p4est->mpisize));
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_rank (p4est->mpicomm, &(p4est->mpirank));
+  SC_CHECK_MPI (mpiret);
+}
+
+int
+p4est_comm_parallel_env_is_null (p4est_t * p4est)
+{
+  return (p4est->mpicomm == sc_MPI_COMM_NULL);
+}
+
+int
+p4est_comm_parallel_env_reduce (p4est_t ** p4est_supercomm)
+{
+  return p4est_comm_parallel_env_reduce_ext (p4est_supercomm,
+                                             sc_MPI_GROUP_NULL, 0, NULL);
+}
+
+int
+p4est_comm_parallel_env_reduce_ext (p4est_t ** p4est_supercomm,
+                                    sc_MPI_Group group_add,
+                                    int add_to_beginning, int **ranks_subcomm)
+{
+  const char         *this_fn_name = "comm_parallel_env_reduce";
+  p4est_t            *p4est = *p4est_supercomm;
+  sc_MPI_Comm         mpicomm = p4est->mpicomm;
+  int                 mpisize = p4est->mpisize;
+  int                 mpiret;
+  p4est_gloidx_t     *global_first_quadrant = p4est->global_first_quadrant;
+  p4est_quadrant_t   *global_first_position = p4est->global_first_position;
+
+  p4est_gloidx_t     *n_quadrants;
+  int                *include;
+  sc_MPI_Group        group, subgroup;
+  sc_MPI_Comm         submpicomm;
+  int                 submpisize, submpirank;
+  int                *ranks, *subranks;
+  int                 p;
+
+  /* a single-rank communicator cannot be reduced; p4est stays unchanged */
+  if (mpisize == 1) {
+    return 1;
+  }
+
+  /* create array of non-empty processes that will be included to sub-comm */
+  n_quadrants = P4EST_ALLOC (p4est_gloidx_t, mpisize);
+  include = P4EST_ALLOC (int, mpisize);
+  submpisize = 0;
+  for (p = 0; p < mpisize; p++) {
+    n_quadrants[p] = global_first_quadrant[p + 1] - global_first_quadrant[p];
+    if (global_first_quadrant[p] < global_first_quadrant[p + 1]) {
+      include[submpisize++] = p;
+    }
+  }
+
+  /* no rank is empty: nothing to reduce, p4est stays unchanged */
+  if (submpisize == mpisize) {
+    P4EST_FREE (n_quadrants);
+    P4EST_FREE (include);
+    return 1;
+  }
+
+  /* create sub-group of non-empty processors */
+  mpiret = sc_MPI_Comm_group (mpicomm, &group);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Group_incl (group, submpisize, include, &subgroup);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Group_free (&group);
+  SC_CHECK_MPI (mpiret);
+  P4EST_FREE (include);
+
+  /* create sub-communicator */
+  if (group_add != sc_MPI_GROUP_NULL) {
+    sc_MPI_Group        group_union;
+
+    /* create union with optional group */
+    if (add_to_beginning) {
+      mpiret = sc_MPI_Group_union (group_add, subgroup, &group_union);
+    }
+    else {
+      mpiret = sc_MPI_Group_union (subgroup, group_add, &group_union);
+    }
+    SC_CHECK_MPI (mpiret);
+
+    /* create sub-communicator */
+    mpiret = sc_MPI_Comm_create (mpicomm, group_union, &submpicomm);
+    SC_CHECK_MPI (mpiret);
+    mpiret = sc_MPI_Group_free (&group_union);
+    SC_CHECK_MPI (mpiret);
+    mpiret = sc_MPI_Group_free (&subgroup);
+    SC_CHECK_MPI (mpiret);
+  }
+  else {
+    /* create sub-communicator */
+    mpiret = sc_MPI_Comm_create (mpicomm, subgroup, &submpicomm);
+    SC_CHECK_MPI (mpiret);
+    mpiret = sc_MPI_Group_free (&subgroup);
+    SC_CHECK_MPI (mpiret);
+  }
+
+  /* destroy p4est and exit if this rank is empty */
+  if (submpicomm == sc_MPI_COMM_NULL) {
+    /* destroy */
+    P4EST_FREE (n_quadrants);
+    p4est_destroy (p4est);
+    *p4est_supercomm = NULL;
+    if (ranks_subcomm) {
+      *ranks_subcomm = NULL;
+    }
+
+    /* return that p4est does not exist on this rank */
+    return 0;
+  }
+
+  /* update parallel environment */
+  mpiret = sc_MPI_Comm_size (submpicomm, &submpisize);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_rank (submpicomm, &submpirank);
+  SC_CHECK_MPI (mpiret);
+
+  if (submpirank == 0) {
+    P4EST_VERBOSEF ("%s: Reduce MPI communicator from %i to %i\n",
+                    this_fn_name, mpisize, submpisize);
+    /* TODO: There should be a function for printing to stdout that works with
+     *       sub-communicators. */
+  }
+
+  /* translate MPI ranks */
+  ranks = P4EST_ALLOC (int, submpisize);
+  subranks = P4EST_ALLOC (int, submpisize);
+  for (p = 0; p < submpisize; p++) {
+    subranks[p] = p;
+  }
+  mpiret = sc_MPI_Comm_group (submpicomm, &subgroup);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_group (mpicomm, &group);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Group_translate_ranks (subgroup, submpisize, subranks,
+                                         group, ranks);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Group_free (&subgroup);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Group_free (&group);
+  SC_CHECK_MPI (mpiret);
+  P4EST_FREE (subranks);
+
+  /* allocate and set global quadrant count */
+  P4EST_FREE (p4est->global_first_quadrant);
+  p4est->global_first_quadrant = P4EST_ALLOC (p4est_gloidx_t, submpisize + 1);
+  p4est->global_first_quadrant[0] = 0;
+  for (p = 0; p < submpisize; p++) {
+    P4EST_ASSERT (ranks[p] != sc_MPI_UNDEFINED);
+    P4EST_ASSERT (group_add != sc_MPI_GROUP_NULL
+                  || 0 < n_quadrants[ranks[p]]);
+    p4est->global_first_quadrant[p + 1] =
+      p4est->global_first_quadrant[p] + n_quadrants[ranks[p]];
+  }
+  P4EST_ASSERT (p4est->global_first_quadrant[submpisize] ==
+                p4est->global_num_quadrants);
+  P4EST_FREE (n_quadrants);
+
+  /* set new parallel environment */
+  p4est_comm_parallel_env_release (p4est);
+  p4est_comm_parallel_env_assign (p4est, submpicomm);
+  p4est_comm_parallel_env_duplicate (p4est);
+  mpiret = sc_MPI_Comm_free (&submpicomm);
+  SC_CHECK_MPI (mpiret);
+  P4EST_ASSERT (p4est->mpisize == submpisize);
+
+  /* allocate and set partition information */
+  p4est->global_first_position =
+    P4EST_ALLOC (p4est_quadrant_t, submpisize + 1);
+  if (group_add != sc_MPI_GROUP_NULL) { /* if communication is required */
+    p4est_comm_global_partition (p4est, NULL);
+  }
+  else {                        /* if we can set partition information communication-free */
+    for (p = 0; p < submpisize; p++) {
+      P4EST_ASSERT (0 == p || ranks[p - 1] < ranks[p]);
+      p4est->global_first_position[p] = global_first_position[ranks[p]];
+    }
+    p4est->global_first_position[submpisize] = global_first_position[mpisize];
+  }
+  P4EST_FREE (global_first_position);
+  if (ranks_subcomm) {
+    *ranks_subcomm = ranks;
+  }
+  else {
+    P4EST_FREE (ranks);
+  }
+
+  /* check for valid p4est */
+  P4EST_ASSERT (p4est_is_valid (p4est));
+
+  /* return that p4est exists on this rank */
+  return 1;
+}
+
+void
 p4est_comm_count_quadrants (p4est_t * p4est)
 {
   int                 mpiret;
@@ -292,6 +561,20 @@ p4est_comm_count_pertree (p4est_t * p4est, p4est_gloidx_t * pertree)
 }
 
 int
+p4est_comm_is_empty (p4est_t * p4est, int p)
+{
+  const p4est_gloidx_t *gfq;
+
+  P4EST_ASSERT (p4est != NULL);
+  P4EST_ASSERT (0 <= p && p < p4est->mpisize);
+
+  gfq = p4est->global_first_quadrant;
+  P4EST_ASSERT (gfq != NULL);
+
+  return gfq[p] == gfq[p + 1];
+}
+
+int
 p4est_comm_is_owner (p4est_t * p4est, p4est_locidx_t which_tree,
                      const p4est_quadrant_t * q, int rank)
 {
@@ -567,8 +850,8 @@ p4est_comm_checksum (p4est_t * p4est, unsigned local_crc, size_t local_bytes)
   if (p4est->mpirank == 0) {
     gather = P4EST_ALLOC (uint64_t, 2 * p4est->mpisize);
   }
-  mpiret = MPI_Gather (send, 2, MPI_LONG_LONG_INT,
-                       gather, 2, MPI_LONG_LONG_INT, 0, p4est->mpicomm);
+  mpiret = sc_MPI_Gather (send, 2, sc_MPI_LONG_LONG_INT,
+                          gather, 2, sc_MPI_LONG_LONG_INT, 0, p4est->mpicomm);
   SC_CHECK_MPI (mpiret);
 
   if (p4est->mpirank == 0) {
diff --git a/src/p4est_communication.h b/src/p4est_communication.h
index eb2a007..d0c6946 100644
--- a/src/p4est_communication.h
+++ b/src/p4est_communication.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -28,34 +29,78 @@
 
 SC_EXTERN_C_BEGIN;
 
-typedef enum
-{
-  P4EST_COMM_COUNT_PERTREE = 1,
-  P4EST_COMM_BALANCE_FIRST_COUNT,
-  P4EST_COMM_BALANCE_FIRST_LOAD,
-  P4EST_COMM_BALANCE_SECOND_COUNT,
-  P4EST_COMM_BALANCE_SECOND_LOAD,
-  P4EST_COMM_PARTITION_GIVEN,
-  P4EST_COMM_PARTITION_WEIGHTED_LOW,
-  P4EST_COMM_PARTITION_WEIGHTED_HIGH,
-  P4EST_COMM_PARTITION_CORRECTION,
-  P4EST_COMM_GHOST_COUNT,
-  P4EST_COMM_GHOST_LOAD,
-  P4EST_COMM_GHOST_EXCHANGE,
-  P4EST_COMM_GHOST_EXPAND_COUNT,
-  P4EST_COMM_GHOST_EXPAND_LOAD,
-  P4EST_COMM_GHOST_SUPPORT_COUNT,
-  P4EST_COMM_GHOST_SUPPORT_LOAD,
-  P4EST_COMM_GHOST_CHECKSUM,
-  P4EST_COMM_NODES_QUERY,
-  P4EST_COMM_NODES_REPLY,
-  P4EST_COMM_SAVE,
-  P4EST_COMM_LNODES_TEST,
-  P4EST_COMM_LNODES_PASS,
-  P4EST_COMM_LNODES_OWNED,
-  P4EST_COMM_LNODES_ALL
-}
-p4est_comm_tag_t;
+/** Assign an MPI communicator to p4est; retrieve parallel environment.
+ *
+ * \param [in] mpicomm    A valid MPI communicator.
+ *
+ * \note The provided MPI communicator is not owned by p4est.
+ */
+void                p4est_comm_parallel_env_assign (p4est_t * p4est,
+                                                    sc_MPI_Comm mpicomm);
+
+/** Duplicate MPI communicator and replace the current one by the duplicate.
+ *
+ * \note The duplicated MPI communicator is owned by p4est.
+ */
+void                p4est_comm_parallel_env_duplicate (p4est_t * p4est);
+
+/** Release MPI communicator if it is owned by p4est.
+ */
+void                p4est_comm_parallel_env_release (p4est_t * p4est);
+
+/** Replace the current MPI communicator by the one provided as input.
+ *
+ * \param [in] mpicomm    A valid MPI communicator.
+ *
+ * \note The provided MPI communicator is not owned by p4est.
+ */
+void                p4est_comm_parallel_env_replace (p4est_t * p4est,
+                                                     sc_MPI_Comm mpicomm);
+
+/** Retrieve parallel environment information.
+ */
+void                p4est_comm_parallel_env_get_info (p4est_t * p4est);
+
+/** Check if the MPI communicator is the null communicator.
+ *
+ * \return True if communicator is sc_MPI_COMM_NULL, false otherwise.
+ */
+int                 p4est_comm_parallel_env_is_null (p4est_t * p4est);
+
+/** Reduce MPI communicator to non-empty ranks (i.e., nonzero quadrant counts).
+ *
+ * \param [in,out] p4est_supercomm  Object whose communicator is reduced.
+ *                                  Points to NULL if this p4est does not
+ *                                  exist on this rank after reduction.
+ *
+ * \return True if p4est exists on this MPI rank after reduction.
+ */
+int                 p4est_comm_parallel_env_reduce (p4est_t **
+                                                    p4est_supercomm);
+
+/** Reduce MPI communicator to non-empty ranks and add a group of ranks that
+ * will remain in the reduced communicator regardless of whether they are
+ * empty or not.
+ *
+ * \param [in,out] p4est_supercomm  Object whose communicator is reduced.
+ *                                  Points to NULL if this p4est does not
+ *                                  exist on this rank after reduction.
+ * \param [in] group_add         Group of ranks that will remain in
+ *                               communicator.
+ * \param [in] add_to_beginning  If true, ranks will be added to the beginning
+ *                               of the reduced communicator, otherwise to the
+ *                               end.
+ * \param[out] ranks_subcomm     If not null, array of size 'subcommsize' with
+ *                               subcommrank->supercommrank map.
+ *
+ * \return True if p4est exists on this MPI rank after reduction.
+ */
+int                 p4est_comm_parallel_env_reduce_ext (p4est_t **
+                                                        p4est_supercomm,
+                                                        sc_MPI_Group
+                                                        group_add,
+                                                        int add_to_beginning,
+                                                        int **ranks_subcomm);
 
 /** Caculate the number and partition of quadrents.
  * \param [in,out] p4est  Adds all \c p4est->local_num_quadrant counters and
@@ -84,6 +129,13 @@ void                p4est_comm_global_partition (p4est_t * p4est,
 void                p4est_comm_count_pertree (p4est_t * p4est,
                                               p4est_gloidx_t * pertree);
 
+/** Query whether a processor has no quadrants.
+ * \param [in] p4est    This forest's global_first_quadrant array must be valid.
+ * \param [in] p        Valid processor id.
+ * \return              True if and only if processor \a p is empty.
+ */
+int                 p4est_comm_is_empty (p4est_t * p4est, int p);
+
 /** Tests ownershop of a quadrant via p4est->global_first_position.
  * Assumes a tree with no overlaps.
  * \param [in] rank    Rank whose ownership is tested.
diff --git a/src/p4est_connectivity.c b/src/p4est_connectivity.c
index 0466b59..ac76612 100644
--- a/src/p4est_connectivity.c
+++ b/src/p4est_connectivity.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -136,9 +137,16 @@ p4est_connectivity_new_copy (p4est_topidx_t num_vertices,
   p4est_topidx_t      num_ctt;
   p4est_connectivity_t *conn;
 
+  P4EST_ASSERT (num_vertices >= 0);
+  P4EST_ASSERT (num_trees >= 0);
+
 #ifdef P4_TO_P8
+  P4EST_ASSERT (num_edges >= 0);
+  P4EST_ASSERT (eoff != NULL);
   num_ett = eoff[num_edges];
 #endif
+  P4EST_ASSERT (num_corners >= 0);
+  P4EST_ASSERT (coff != NULL);
   num_ctt = coff[num_corners];
   conn = p4est_connectivity_new (num_vertices, num_trees,
 #ifdef P4_TO_P8
@@ -193,6 +201,15 @@ p4est_connectivity_new (p4est_topidx_t num_vertices, p4est_topidx_t num_trees,
 {
   p4est_connectivity_t *conn;
 
+  P4EST_ASSERT (num_vertices >= 0);
+  P4EST_ASSERT (num_trees >= 0);
+#ifdef P4_TO_P8
+  P4EST_ASSERT (num_edges >= 0);
+  P4EST_ASSERT (num_ett >= 0);
+#endif
+  P4EST_ASSERT (num_corners >= 0);
+  P4EST_ASSERT (num_ctt >= 0);
+
   conn = P4EST_ALLOC_ZERO (p4est_connectivity_t, 1);
 
   conn->num_vertices = num_vertices;
@@ -244,6 +261,138 @@ p4est_connectivity_new (p4est_topidx_t num_vertices, p4est_topidx_t num_trees,
   return conn;
 }
 
+p4est_connectivity_t *
+p4est_connectivity_bcast (p4est_connectivity_t * conn_in, int root,
+                          sc_MPI_Comm mpicomm)
+{
+  int                 mpirank, mpiret;
+  p4est_connectivity_t *conn;
+  struct
+  {
+    p4est_topidx_t      num_vertices, num_trees, num_corners, num_ctt;
+    size_t              tree_attr_bytes;
+#ifdef P4_TO_P8
+    p4est_topidx_t      num_edges, num_ett;
+#endif
+  }
+  conn_dimensions;
+
+  mpiret = sc_MPI_Comm_rank (mpicomm, &mpirank);
+  SC_CHECK_MPI (mpiret);
+  /* fill conn_dimensions on root process */
+  if (mpirank == root) {
+    P4EST_ASSERT (conn_in != NULL);
+    memset (&conn_dimensions, -1, sizeof (conn_dimensions));
+    conn = conn_in;
+    conn_dimensions.num_corners = conn->num_corners;
+    conn_dimensions.num_trees = conn->num_trees;
+    conn_dimensions.num_vertices = conn->num_vertices;
+    conn_dimensions.tree_attr_bytes = conn->tree_attr_bytes;
+    conn_dimensions.num_ctt = conn->ctt_offset[conn->num_corners];
+#ifdef P4_TO_P8
+    conn_dimensions.num_edges = conn->num_edges;
+    conn_dimensions.num_ett = conn->ett_offset[conn->num_edges];
+#endif
+  }
+  else {
+    P4EST_ASSERT (conn_in == NULL);
+    conn = NULL;                /* suppress 'maybe used uninitialized' warning */
+  }
+  /* broadcast the dimensions to all processes */
+  mpiret = sc_MPI_Bcast (&conn_dimensions, sizeof (conn_dimensions),
+                         sc_MPI_BYTE, root, mpicomm);
+  SC_CHECK_MPI (mpiret);
+
+  /* allocate memory for new connectivity */
+  if (mpirank != root) {
+    P4EST_ASSERT (conn == NULL);
+    conn = p4est_connectivity_new (conn_dimensions.num_vertices,
+                                   conn_dimensions.num_trees,
+#ifdef P4_TO_P8
+                                   conn_dimensions.num_edges,
+                                   conn_dimensions.num_ett,
+#endif
+                                   conn_dimensions.num_corners,
+                                   conn_dimensions.num_ctt);
+    p4est_connectivity_set_attr (conn, conn_dimensions.tree_attr_bytes);
+  }
+
+  /* Broadcast the arrays if not NULL.  If a pointer is NULL on one process
+   * then it is NULL on every process, therefore the if-constructions work */
+  if (conn->num_vertices > 0) {
+    P4EST_ASSERT (conn->vertices != NULL);
+    P4EST_ASSERT (conn->tree_to_vertex != NULL);
+    mpiret = sc_MPI_Bcast (conn->vertices, 3 * conn_dimensions.num_vertices,
+                           sc_MPI_DOUBLE, root, mpicomm);
+    SC_CHECK_MPI (mpiret);
+    mpiret = sc_MPI_Bcast (conn->tree_to_vertex,
+                           P4EST_CHILDREN * conn_dimensions.num_trees,
+                           P4EST_MPI_TOPIDX, root, mpicomm);
+    SC_CHECK_MPI (mpiret);
+  }
+
+  mpiret =
+    sc_MPI_Bcast (conn->tree_to_tree, P4EST_FACES * conn_dimensions.num_trees,
+                  P4EST_MPI_TOPIDX, root, mpicomm);
+  SC_CHECK_MPI (mpiret);
+  mpiret =
+    sc_MPI_Bcast (conn->tree_to_face, P4EST_FACES * conn_dimensions.num_trees,
+                  sc_MPI_BYTE, root, mpicomm);
+  SC_CHECK_MPI (mpiret);
+
+  if (conn->num_corners > 0) {
+    P4EST_ASSERT (conn->tree_to_corner != NULL);
+    P4EST_ASSERT (conn->corner_to_tree != NULL);
+    P4EST_ASSERT (conn->corner_to_corner != NULL);
+    mpiret = sc_MPI_Bcast (conn->tree_to_corner,
+                           P4EST_CHILDREN * conn_dimensions.num_trees,
+                           P4EST_MPI_TOPIDX, root, mpicomm);
+    SC_CHECK_MPI (mpiret);
+    mpiret = sc_MPI_Bcast (conn->corner_to_tree, conn_dimensions.num_ctt,
+                           P4EST_MPI_TOPIDX, root, mpicomm);
+    SC_CHECK_MPI (mpiret);
+    mpiret = sc_MPI_Bcast (conn->corner_to_corner, conn_dimensions.num_ctt,
+                           sc_MPI_BYTE, root, mpicomm);
+    SC_CHECK_MPI (mpiret);
+  }
+
+  mpiret = sc_MPI_Bcast (conn->ctt_offset, conn_dimensions.num_corners,
+                         P4EST_MPI_TOPIDX, root, mpicomm);
+  P4EST_ASSERT (conn->ctt_offset[conn->num_corners] ==
+                conn_dimensions.num_ctt);
+  SC_CHECK_MPI (mpiret);
+#ifdef P4_TO_P8
+  if (conn->num_edges > 0) {
+    P4EST_ASSERT (conn->tree_to_edge != NULL);
+    P4EST_ASSERT (conn->edge_to_tree != NULL);
+    P4EST_ASSERT (conn->edge_to_edge != NULL);
+    mpiret = sc_MPI_Bcast (conn->tree_to_edge,
+                           P8EST_EDGES * conn_dimensions.num_trees,
+                           P4EST_MPI_TOPIDX, root, mpicomm);
+    SC_CHECK_MPI (mpiret);
+    mpiret = sc_MPI_Bcast (conn->edge_to_tree, conn_dimensions.num_ett,
+                           P4EST_MPI_TOPIDX, root, mpicomm);
+    SC_CHECK_MPI (mpiret);
+    mpiret = sc_MPI_Bcast (conn->edge_to_edge, conn_dimensions.num_ett,
+                           sc_MPI_BYTE, root, mpicomm);
+    SC_CHECK_MPI (mpiret);
+  }
+  mpiret = sc_MPI_Bcast (conn->ett_offset, conn_dimensions.num_edges,
+                         P4EST_MPI_TOPIDX, root, mpicomm);
+  P4EST_ASSERT (conn->ett_offset[conn->num_edges] == conn_dimensions.num_ett);
+  SC_CHECK_MPI (mpiret);
+#endif
+
+  if (conn->tree_attr_bytes != 0) {
+    mpiret = sc_MPI_Bcast (conn->tree_to_attr,
+                           conn->tree_attr_bytes * conn->num_trees,
+                           sc_MPI_BYTE, root, mpicomm);
+    SC_CHECK_MPI (mpiret);
+  }
+  P4EST_ASSERT (p4est_connectivity_is_valid (conn));
+  return conn;
+}
+
 void
 p4est_connectivity_destroy (p4est_connectivity_t * conn)
 {
@@ -463,7 +612,7 @@ p4est_connectivity_is_valid (p4est_connectivity_t * conn)
           }
           nflip = (int) ete[nett] / P8EST_EDGES;
           if (ntree == tree && nedge == edge) {
-            if (flip != -1) {
+            if (flip != -1 && nflip == flip) {
               errcode = 1;
               break;
             }
@@ -1475,6 +1624,192 @@ p4est_connectivity_new_disk (void)
 
 #endif /* !P4_TO_P8 */
 
+p4est_connectivity_t *
+p4est_connectivity_new_twotrees (int l_face, int r_face, int orientation)
+{
+  int                 i;
+  const p4est_topidx_t num_vertices = (P4EST_DIM - 1) * 6;      /* 6 or 12 */
+  const p4est_topidx_t num_trees = 2;
+
+  /* no tree connection via edges and corners */
+#ifdef P4_TO_P8
+  int                 op;
+  const p4est_topidx_t num_edges = 0;
+  const p4est_topidx_t num_ett = 0;
+#endif /* P4_TO_P8 */
+  const p4est_topidx_t num_corners = 0;
+  const p4est_topidx_t num_ctt = 0;
+
+/* *INDENT-OFF* */
+  const double        vertices[(P4EST_DIM - 1) * 6 * 3] = {
+    0, 0, 0,
+    1, 0, 0,
+    2, 0, 0,
+    0, 1, 0,
+    1, 1, 0,
+    2, 1, 0
+#ifdef P4_TO_P8
+           ,
+    0, 0, 1,
+    1, 0, 1,
+    2, 0, 1,
+    0, 1, 1,
+    1, 1, 1,
+    2, 1, 1
+#endif /* P4_TO_P8 */
+  };
+
+  /* define mapping from tree to vertex for each face */
+  const int           leftTree[P4EST_FACES][P4EST_CHILDREN] =
+#ifndef P4_TO_P8
+    {{ 1, 0, 4, 3 },
+     { 0, 1, 3, 4 },
+     { 1, 4, 0, 3 },
+     { 0, 3, 1, 4}};
+#else /* !P4_TO_P8 */
+    {{  1,  0,  7,  6,  4,  3, 10,  9 },
+     {  0,  1,  3,  4,  6,  7,  9, 10 },
+     {  1,  4,  0,  3,  7, 10,  6,  9 },
+     {  0,  6,  1,  7,  3,  9,  4, 10 },
+     {  1,  7,  4, 10,  0,  6,  3,  9 },
+     {  0,  3,  6,  9,  1,  4,  7, 10 }};
+#endif /* !P4_TO_P8 */
+
+  const int           rightTree[P4EST_FACES][P4EST_CHILDREN] =
+#ifndef P4_TO_P8
+    {{ 1, 2, 4, 5 },
+     { 2, 1, 5, 4 },
+     { 1, 4, 2, 5 },
+     { 2, 5, 1, 4 }};
+#else /* !P4_TO_P8 */
+    {{  1,  2,  4,  5,  7,  8, 10, 11 },
+     {  2,  1,  8,  7,  5,  4, 11, 10 },
+     {  1,  7,  2,  8,  4, 10,  5, 11 },
+     {  2,  5,  1,  4,  8, 11,  7, 10 },
+     {  1,  4,  7, 10,  2,  5,  8, 11 },
+     {  2,  8,  5, 11,  1,  7,  4, 10 }};
+#endif /* !P4_TO_P8 */
+
+  /* define rotations for right tree in order to set the orientation */
+  const int flip[(P4EST_DIM - 1) * 6] =
+#ifndef P4_TO_P8
+    { -1,  4,  5, -1,  1,  2 };
+#else /* !P4_TO_P8 */
+    { -1, 10, 11, -1,  7,  8, -1,  4,  5, -1,  1,  2 };
+  const int rotateClockWise[(P4EST_DIM - 1) * 6] =
+    { -1,  7,  8, -1,  1,  2, -1, 10, 11, -1,  4,  5 };
+  const int rotateCounterClockWise[(P4EST_DIM - 1) * 6] =
+    { -1,  4,  5, -1, 10, 11, -1,  1,  2, -1,  7,  8 };
+#endif /* !P4_TO_P8 */
+/* *INDENT-ON* */
+
+  /* initialize values in tree_to_vertex */
+  p4est_topidx_t      tree_to_vertex[P4EST_CHILDREN * 2] = {
+    -1, -1, -1, -1, -1, -1, -1, -1
+#ifdef P4_TO_P8
+      ,
+    -1, -1, -1, -1, -1, -1, -1, -1
+#endif /* P4_TO_P8 */
+  };
+
+  P4EST_ASSERT (0 <= l_face && l_face < P4EST_FACES);
+  P4EST_ASSERT (0 <= r_face && r_face < P4EST_FACES);
+  P4EST_ASSERT (0 <= orientation && orientation < P4EST_HALF);
+
+  /* populate according to specified faces */
+  for (i = 0; i < P4EST_CHILDREN; ++i) {
+    tree_to_vertex[i] = leftTree[l_face][i];
+    tree_to_vertex[P4EST_CHILDREN + i] = rightTree[r_face][i];
+  }
+
+  /* rotate trees such that the corners fall to the respective places
+     as specified */
+#ifndef P4_TO_P8
+  if (orientation == 1) {
+    for (i = 0; i < P4EST_CHILDREN; ++i) {
+      tree_to_vertex[P4EST_CHILDREN + i] =
+        flip[tree_to_vertex[P4EST_CHILDREN + i]];
+    }
+  }
+#else /* P4_TO_P8 */
+  op = -1;
+  if (orientation == 3) {
+    op = 2;
+  }
+  else if (1 <= orientation && orientation <= 2) {
+    if (l_face <= r_face) {
+      op = p8est_face_permutation_refs[l_face][r_face];
+    }
+    else {
+      op = p8est_face_permutation_refs[r_face][l_face];
+    }
+  }
+  switch (op) {
+  case 0:                      /* clockwise rotation */
+    for (i = 0; i < P4EST_CHILDREN; ++i) {
+      tree_to_vertex[P4EST_CHILDREN + i] =
+        rotateClockWise[tree_to_vertex[P4EST_CHILDREN + i]];
+    }
+    break;
+  case 1:                      /* counterclockwise rotation */
+    for (i = 0; i < P4EST_CHILDREN; ++i) {
+      tree_to_vertex[P4EST_CHILDREN + i] =
+        rotateCounterClockWise[tree_to_vertex[P4EST_CHILDREN + i]];
+    }
+    break;
+  case 2:                      /* flip */
+    for (i = 0; i < P4EST_CHILDREN; ++i) {
+      tree_to_vertex[P4EST_CHILDREN + i] =
+        flip[tree_to_vertex[P4EST_CHILDREN + i]];
+    }
+    break;
+  default:
+    /* we do nothing */
+    break;
+  }
+#endif /* P4_TO_P8 */
+
+/* *INDENT-OFF* */
+  /* create tree_to_tree and tree_to_face */
+  p4est_topidx_t tree_to_tree[2 * P4EST_FACES] =
+#ifndef P4_TO_P8
+    {0, 0, 0, 0,
+     1, 1, 1, 1};
+#else /* !P4_TO_P8 */
+    {0, 0, 0, 0, 0, 0,
+     1, 1, 1, 1, 1, 1};
+#endif /* !P4_TO_P8 */
+  int8_t tree_to_face[2 * P4EST_FACES] =
+#ifndef P4_TO_P8
+    {0, 1, 2, 3,
+     0, 1, 2, 3,};
+#else /* !P4_TO_P8 */
+    {0, 1, 2, 3, 4, 5,
+     0, 1, 2, 3, 4, 5};
+#endif /* !P4_TO_P8 */
+/* *INDENT-ON* */
+
+  /* set values where trees are connected */
+  tree_to_tree[l_face] = 1;
+  tree_to_tree[P4EST_FACES + r_face] = 0;
+
+  tree_to_face[l_face] = (int8_t) (P4EST_FACES * orientation + r_face);
+  tree_to_face[P4EST_FACES + r_face] =
+    (int8_t) (P4EST_FACES * orientation + l_face);
+
+  /* create connectivity structure */
+  return p4est_connectivity_new_copy (num_vertices, num_trees,
+#ifdef P4_TO_P8
+                                      num_edges,
+#endif /* P4_TO_P8 */
+                                      num_corners, vertices, tree_to_vertex,
+                                      tree_to_tree, tree_to_face,
+#ifdef P4_TO_P8
+                                      NULL, &num_ett, NULL, NULL,
+#endif /* P4_TO_P8 */
+                                      NULL, &num_ctt, NULL, NULL);
+}
+
 static inline void
 brick_linear_to_xyz (p4est_topidx_t ti, const int logx[P4EST_DIM],
                      const int rankx[P4EST_DIM], p4est_topidx_t tx[P4EST_DIM])
@@ -2254,72 +2589,104 @@ p4est_find_corner_transform_internal (p4est_connectivity_t * conn,
                                       p4est_topidx_t itree, int icorner,
                                       p4est_corner_info_t * ci,
                                       p4est_topidx_t * ctt, int8_t * ctc,
-                                      p4est_topidx_t corner_trees,
-                                      p4est_topidx_t ntree[P4EST_DIM])
+                                      p4est_topidx_t corner_trees)
 {
-  int                 i;
-  int                 iface[P4EST_DIM], nface[P4EST_DIM];
-  int                 orient[P4EST_DIM], fcorner[P4EST_DIM];
+  int                 i, j;
+  int                 iface, nface;
+  int                 orient, fcorner;
   int                 ncorner, ncode;
   int                 fc, nc;
-  int                 omit;
   p4est_topidx_t      ctree, nctree;
 #ifdef P4_TO_P8
-  int                 iedge[3], iwhich[3];
+  int                 iedge, iwhich, k;
   int                 pref, pset;
   size_t              jz;
-  p4est_topidx_t      aedge[3];
-  p8est_edge_info_t   ei[3];
-  sc_array_t         *eta[3];
+  p4est_topidx_t      aedge;
+  p8est_edge_info_t   ei;
+  sc_array_t         *eta;
   p8est_edge_transform_t *et;
 #endif
   sc_array_t         *cta = &ci->corner_transforms;
   p4est_corner_transform_t *ct;
-  int                 edge_ignored = 0;
+  int                 ndistinct = 1;
+  sc_array_t          distinct;
+  p4est_topidx_t      ntree;
 
   P4EST_ASSERT (0 <= itree && itree < conn->num_trees);
   P4EST_ASSERT (0 <= icorner && icorner < P4EST_CHILDREN);
   P4EST_ASSERT (cta->elem_size == sizeof (p4est_corner_transform_t));
 
+  sc_array_init_size (&distinct, sizeof (p4est_corner_transform_t), 1);
+  ct = (p4est_corner_transform_t *) sc_array_index (&distinct, 0);
+  ct->ntree = itree;
+  ct->ncorner = icorner;
+
   /* find the face neighbors */
   for (i = 0; i < P4EST_DIM; ++i) {
-    iface[i] = p4est_corner_faces[icorner][i];
-    ntree[i] = conn->tree_to_tree[P4EST_FACES * itree + iface[i]];
-    ncode = (int) conn->tree_to_face[P4EST_FACES * itree + iface[i]];
-    if (ntree[i] == itree && ncode == iface[i]) {       /* domain boundary */
-      ntree[i] = -1;
-      nface[i] = -1;
-      orient[i] = -1;
-      fcorner[i] = -1;
-    }
-    else {
-      nface[i] = ncode % P4EST_FACES;
-      orient[i] = ncode / P4EST_FACES;
-      fcorner[i] = p4est_corner_face_corners[icorner][iface[i]];
-      P4EST_ASSERT (fcorner[i] >= 0);
+    iface = p4est_corner_faces[icorner][i];
+    ntree = conn->tree_to_tree[P4EST_FACES * itree + iface];
+    ncode = (int) conn->tree_to_face[P4EST_FACES * itree + iface];
+    if (ntree != itree || ncode != iface) {     /* not domain boundary */
+      nface = ncode % P4EST_FACES;
+      orient = ncode / P4EST_FACES;
+      fcorner = p4est_corner_face_corners[icorner][iface];
+      P4EST_ASSERT (fcorner >= 0);
+#ifdef P4_TO_P8
+      pref = p8est_face_permutation_refs[iface][nface];
+      pset = p8est_face_permutation_sets[pref][orient];
+      fc = p8est_face_permutations[pset][fcorner];
+#else
+      fc = fcorner ^ orient;
+#endif
+      nc = p4est_face_corners[nface][fc];
+      for (j = 0; j < ndistinct; j++) {
+        ct = (p4est_corner_transform_t *) sc_array_index_int (&distinct, j);
+        if (ntree == ct->ntree && nc == (int) ct->ncorner) {
+          break;
+        }
+      }
+      if (j == ndistinct) {
+        ct = (p4est_corner_transform_t *) sc_array_push (&distinct);
+        ct->ntree = ntree;
+        ct->ncorner = nc;
+        ndistinct++;
+      }
     }
   }
 
 #ifdef P4_TO_P8
   /* find the three edge transforms */
-  if (conn->num_edges == 0) {
-    eta[0] = eta[1] = eta[2] = NULL;
-    aedge[0] = aedge[1] = aedge[2] = -1;
-  }
-  else {
+  if (conn->num_edges != 0) {
     for (i = 0; i < 3; ++i) {
-      iedge[i] = p8est_corner_edges[icorner][i];
-      aedge[i] = conn->tree_to_edge[P8EST_EDGES * itree + iedge[i]];
-      if (aedge[i] == -1) {
-        eta[i] = NULL;
+      iedge = p8est_corner_edges[icorner][i];
+      aedge = conn->tree_to_edge[P8EST_EDGES * itree + iedge];
+      if (aedge == -1) {
         continue;
       }
-      iwhich[i] = (p8est_edge_corners[iedge[i]][1] == icorner);
-      P4EST_ASSERT (p8est_edge_corners[iedge[i]][iwhich[i]] == icorner);
+      iwhich = (p8est_edge_corners[iedge][1] == icorner);
+      P4EST_ASSERT (p8est_edge_corners[iedge][iwhich] == icorner);
 
-      eta[i] = &ei[i].edge_transforms;
-      sc_array_init (eta[i], sizeof (p8est_edge_transform_t));
-      p8est_find_edge_transform (conn, itree, iedge[i], &ei[i]);
+      eta = &ei.edge_transforms;
+      sc_array_init (eta, sizeof (p8est_edge_transform_t));
+      p8est_find_edge_transform (conn, itree, iedge, &ei);
+      for (jz = 0; jz < eta->elem_count; ++jz) {
+        et = p8est_edge_array_index (eta, jz);
+        ntree = et->ntree;
+        nc = p8est_edge_corners[et->nedge][et->nflip ^ iwhich];
+        for (k = 0; k < ndistinct; k++) {
+          ct = (p4est_corner_transform_t *) sc_array_index_int (&distinct, k);
+          if (ntree == ct->ntree && nc == (int) ct->ncorner) {
+            break;
+          }
+        }
+        if (k == ndistinct) {
+          ct = (p4est_corner_transform_t *) sc_array_push (&distinct);
+          ct->ntree = ntree;
+          ct->ncorner = nc;
+          ndistinct++;
+        }
+      }
+      sc_array_reset (eta);
     }
   }
 #endif
@@ -2330,60 +2697,17 @@ p4est_find_corner_transform_internal (p4est_connectivity_t * conn,
     P4EST_ASSERT (0 <= nctree && nctree < conn->num_trees);
     ncorner = (int) ctc[ctree];
     P4EST_ASSERT (ncorner >= 0 && ncorner < P4EST_CHILDREN);
-    if (ncorner == icorner && nctree == itree) {
-      continue;
-    }
-
-    /* rule out face neighbors */
-    omit = 0;
-    for (i = 0; i < P4EST_DIM; ++i) {
-      if (nctree == ntree[i]) {
-        P4EST_ASSERT (fcorner[i] >= 0);
-#ifdef P4_TO_P8
-        pref = p8est_face_permutation_refs[iface[i]][nface[i]];
-        pset = p8est_face_permutation_sets[pref][orient[i]];
-        fc = p8est_face_permutations[pset][fcorner[i]];
-#else
-        fc = fcorner[i] ^ orient[i];
-#endif
-        nc = p4est_face_corners[nface[i]][fc];
-
-        if (nc == ncorner) {
-          omit = 1;
-          break;
-        }
-      }
-    }
-    if (omit)
-      continue;
 
-#ifdef P4_TO_P8
-    /* rule out edge neighbors */
-    omit = 0;
-    for (i = 0; i < 3; ++i) {
-      if (aedge[i] == -1) {
-        continue;
-      }
-      for (jz = 0; jz < eta[i]->elem_count; ++jz) {
-        et = p8est_edge_array_index (eta[i], jz);
-        if (nctree == et->ntree) {
-          P4EST_ASSERT ((iwhich[i] & ~1) == 0);
-          nc = p8est_edge_corners[et->nedge][et->nflip ^ iwhich[i]];
-
-          if (nc == ncorner) {
-            omit = 1;
-            break;
-          }
-        }
-      }
-      if (omit)
+    /* compare against corners found via self, faces [and edges (3D)] */
+    for (j = 0; j < ndistinct; j++) {
+      ct = (p4est_corner_transform_t *) sc_array_index_int (&distinct, j);
+      if (nctree == ct->ntree && ncorner == (int) ct->ncorner) {
         break;
+      }
     }
-    if (omit) {
-      ++edge_ignored;
+    if (j < ndistinct) {
       continue;
     }
-#endif
 
     /* else we have a true all-diagonal corner with ntree */
     ct = (p4est_corner_transform_t *) sc_array_push (cta);
@@ -2391,15 +2715,9 @@ p4est_find_corner_transform_internal (p4est_connectivity_t * conn,
     ct->ncorner = (int8_t) ncorner;
   }
 
-#ifdef P4_TO_P8
-  for (i = 0; i < 3; ++i) {
-    if (aedge[i] >= 0) {
-      sc_array_reset (eta[i]);
-    }
-  }
-#endif
+  sc_array_reset (&distinct);
 
-  return edge_ignored;
+  return ndistinct;
 }
 
 void
@@ -2407,11 +2725,10 @@ p4est_find_corner_transform (p4est_connectivity_t * conn,
                              p4est_topidx_t itree, int icorner,
                              p4est_corner_info_t * ci)
 {
-  int                 ignored;
 #ifdef P4EST_ENABLE_DEBUG
-  size_t              expected_count;
+  int                 ignored;
 #endif
-  p4est_topidx_t      ntree[P4EST_DIM], corner_trees, acorner, cttac;
+  p4est_topidx_t      corner_trees, acorner, cttac;
   sc_array_t         *cta = &ci->corner_transforms;
 
   P4EST_ASSERT (0 <= itree && itree < conn->num_trees);
@@ -2436,21 +2753,17 @@ p4est_find_corner_transform (p4est_connectivity_t * conn,
   P4EST_ASSERT (0 <= cttac && 1 <= corner_trees);
 
   /* loop through all corner neighbors and find corner connections */
-  ignored = p4est_find_corner_transform_internal (conn, itree, icorner, ci,
-                                                  conn->corner_to_tree +
-                                                  cttac,
-                                                  conn->corner_to_corner +
-                                                  cttac, corner_trees, ntree);
 #ifdef P4EST_ENABLE_DEBUG
-  expected_count = cta->elem_count + 1 + (ntree[0] != -1) + (ntree[1] != -1);
-#ifdef P4_TO_P8
-  expected_count += (ntree[2] != -1);
+  ignored =
 #else
-  P4EST_ASSERT (ignored == 0);
-#endif
-
-  P4EST_ASSERT (corner_trees == (p4est_topidx_t) (expected_count + ignored));
+  (void)
 #endif
+    p4est_find_corner_transform_internal (conn, itree, icorner, ci,
+                                          conn->corner_to_tree +
+                                          cttac,
+                                          conn->corner_to_corner +
+                                          cttac, corner_trees);
+  P4EST_ASSERT (corner_trees == (p4est_topidx_t) (cta->elem_count + ignored));
 }
 
 void
@@ -2475,13 +2788,9 @@ p4est_connectivity_complete (p4est_connectivity_t * conn)
   sc_array_t          edge_array, edge_to_pz;
   sc_array_t         *eta = &einfo.edge_transforms;
 #endif
-  int                 ignored;
   int8_t             *ct;
   size_t              zcount;
-#ifdef P4EST_ENABLE_DEBUG
-  size_t              expected_count;
-#endif
-  p4est_topidx_t      real_corners, ntree[P4EST_DIM];
+  p4est_topidx_t      real_corners;
   p4est_topidx_t      ctt_count, ctt_offset, ctt_corner;
   p4est_corner_info_t cinfo;
   sc_array_t         *node_trees, *nt;
@@ -2697,20 +3006,13 @@ p4est_connectivity_complete (p4est_connectivity_t * conn)
       pt = (p4est_topidx_t *) sc_array_index (&ei->trees, egz);
       et = (int8_t *) sc_array_index (&ei->edges, egz);
       einfo.iedge = -1;         /* unused */
-      P4EST_EXECUTE_ASSERT_FALSE
-        (p8est_find_edge_transform_internal (conn, *pt, *et, &einfo,
-                                             conn->edge_to_tree +
-                                             ett_offset,
-                                             conn->edge_to_edge +
-                                             ett_offset,
-                                             ei->trees.elem_count, ntree));
-      if (eta->elem_count == 0) {
-        P4EST_ASSERT (ntree[0] != -1 || ntree[1] != -1);
-      }
-      else {
+      p8est_find_edge_transform_internal (conn, *pt, *et, &einfo,
+                                          conn->edge_to_tree +
+                                          ett_offset,
+                                          conn->edge_to_edge +
+                                          ett_offset, ei->trees.elem_count);
+      if (eta->elem_count != 0) {
         /* edge is non-redundant */
-        P4EST_ASSERT (ei->trees.elem_count == eta->elem_count
-                      + 1 + (ntree[0] != -1) + (ntree[1] != -1));
         break;
       }
     }
@@ -2817,32 +3119,14 @@ p4est_connectivity_complete (p4est_connectivity_t * conn)
         pt = (p4est_topidx_t *) sc_array_index (nt, pz);
         ct = (int8_t *) sc_array_index (nc, pz);
         cinfo.icorner = -1;     /* unused */
-        ignored =
+        (void)
           p4est_find_corner_transform_internal (conn, *pt, *ct, &cinfo,
                                                 conn->corner_to_tree +
                                                 ctt_offset,
                                                 conn->corner_to_corner +
-                                                ctt_offset, zcount, ntree);
-#ifndef P4_TO_P8
-        P4EST_ASSERT (ignored == 0);
-#endif
-        if (cta->elem_count == 0) {
-#ifndef P4_TO_P8
-          P4EST_ASSERT (ntree[0] != -1 || ntree[1] != -1);
-#else
-          P4EST_ASSERT (ntree[0] != -1 || ntree[1] != -1 || ntree[2] != -1);
-#endif
-        }
-        else {
+                                                ctt_offset, zcount);
+        if (cta->elem_count != 0) {
           /* corner is non-redundant */
-#ifdef P4EST_ENABLE_DEBUG
-          expected_count =
-            cta->elem_count + 1 + (ntree[0] != -1) + (ntree[1] != -1);
-#ifdef P4_TO_P8
-          expected_count += (ntree[2] != -1);
-#endif
-          P4EST_ASSERT (zcount == expected_count + ignored);
-#endif
           break;
         }
       }
@@ -3760,12 +4044,13 @@ int
 p4est_connectivity_is_equivalent (p4est_connectivity_t * conn1,
                                   p4est_connectivity_t * conn2)
 {
+  const size_t        topsize = sizeof (p4est_topidx_t);
+  const size_t        int8size = sizeof (int8_t);
   size_t              count;
   p4est_topidx_t      ntrees, t;
+
   P4EST_ASSERT (p4est_connectivity_is_valid (conn1));
   P4EST_ASSERT (p4est_connectivity_is_valid (conn2));
-  size_t              topsize = sizeof (p4est_topidx_t);
-  size_t              int8size = sizeof (int8_t);
 
   /* same pointer or equality are stronger */
   if (conn1 == conn2 || p4est_connectivity_is_equal (conn1, conn2)) {
@@ -3930,14 +4215,17 @@ p4est_connectivity_getline_upper (FILE * stream)
     }
 
     if (--len == 0) {
+      char               *linen;
+
       len = lenmax;
       lenmax *= 2;
-      char               *linen = P4EST_REALLOC (linep, char, lenmax);
 
+      linen = P4EST_REALLOC (linep, char, lenmax);
       if (linen == NULL) {
         P4EST_FREE (linep);
         return NULL;
       }
+
       line = linen + (line - linep);
       linep = linen;
     }
@@ -4032,38 +4320,40 @@ p4est_connectivity_read_inp_stream (FILE * stream,
     }
     else if (reading_elements) {
       if (fill_trees_and_vertices) {
+        long long int       element_number;
         long long int       v[P4EST_CHILDREN];
         int                 n;
         int                 retval;
 
-        if (num_elements >= *num_trees) {
-          P4EST_LERROR ("Encountered element that will not fit into"
-                        " tree_to_vertex array. More elements than expected.\n");
-          P4EST_FREE (line);
-          return 1;
-        }
-
         /* Note that when we read in the
          * vertices we switch from right-hand
          * vertex ordering to z-order
          */
-        retval = sscanf (line, "%*d, %lld, %lld, %lld, %lld"
+        retval = sscanf (line, "%lld, %lld, %lld, %lld, %lld"
 #ifdef P4_TO_P8
                          ", %lld, %lld, %lld, %lld"
 #endif
-                         , &v[0], &v[1], &v[3], &v[2]
+                         , &element_number, &v[0], &v[1], &v[3], &v[2]
 #ifdef P4_TO_P8
                          , &v[4], &v[5], &v[7], &v[6]
 #endif
           );
-        if (retval != P4EST_CHILDREN) {
+        if (retval != P4EST_CHILDREN + 1) {
           P4EST_LERROR ("Premature end of file");
           P4EST_FREE (line);
           return 1;
         }
 
+        if (element_number > *num_trees) {
+          P4EST_LERROR ("Encountered element that will not fit into"
+                        " tree_to_vertex array. More elements than expected.\n");
+          P4EST_FREE (line);
+          return 1;
+        }
+
         for (n = 0; n < P4EST_CHILDREN; ++n)
-          tree_to_vertex[P4EST_CHILDREN * num_elements + n] = v[n] - 1;
+          tree_to_vertex[P4EST_CHILDREN * (element_number - 1) + n] =
+            v[n] - 1;
       }
 
       ++num_elements;
diff --git a/src/p4est_connectivity.h b/src/p4est_connectivity.h
index 3ce9304..3456723 100644
--- a/src/p4est_connectivity.h
+++ b/src/p4est_connectivity.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -46,7 +47,7 @@ SC_EXTERN_C_BEGIN;
 #define P4EST_FACES (2 * P4EST_DIM)
 /** The number of children of a quadrant
  *
- * also the nmber of corners */
+ * also the number of corners */
 #define P4EST_CHILDREN 4
 /** The number of children/corners touching one face */
 #define P4EST_HALF (P4EST_CHILDREN / 2)
@@ -126,12 +127,14 @@ const char         *p4est_connect_type_string (p4est_connect_type_t btype);
  * [0][0]..[0][2]..[num_vertices-1][0]..[num_vertices-1][2].
  *
  * The corners are only stored when they connect trees.
+ * In this case tree_to_corner indexes into \a ctt_offset.
  * Otherwise the tree_to_corner entry must be -1 and this corner is ignored.
  * If num_corners == 0, tree_to_corner and corner_to_* arrays are set to NULL.
  *
  * The arrays corner_to_* store a variable number of entries per corner.
  * For corner c these are at position [ctt_offset[c]]..[ctt_offset[c+1]-1].
  * Their number for corner c is ctt_offset[c+1] - ctt_offset[c].
+ * The entries encode all trees adjacent to corner c.
  * The size of the corner_to_* arrays is num_ctt = ctt_offset[num_corners].
  *
  * The *_to_attr arrays may have arbitrary contents defined by the user.
@@ -233,6 +236,8 @@ p4est_connectivity_t *p4est_connectivity_new (p4est_topidx_t num_vertices,
  * \param [in] num_trees      Number of trees in the forest.
  * \param [in] num_corners    Number of tree-connecting corners.
  * \param [in] coff           Corner-to-tree offsets (num_corners + 1 values).
+ *                            This must always be non-NULL; in trivial cases
+ *                            it is just a pointer to a p4est_topidx_t value of 0.
  * \return                    The connectivity is checked for validity.
  */
 p4est_connectivity_t *p4est_connectivity_new_copy (p4est_topidx_t
@@ -249,6 +254,21 @@ p4est_connectivity_t *p4est_connectivity_new_copy (p4est_topidx_t
                                                    const p4est_topidx_t * ctt,
                                                    const int8_t * ctc);
 
+/** Broadcast a connectivity structure that exists only on one process to all.
+ *  On the other processors, it will be allocated using p4est_connectivity_new.
+ *  \param [in] conn_in For the root process the connectivity to be broadcast,
+ *                      for the other processes it must be NULL.
+ *  \param [in] root    The rank of the process that provides the connectivity.
+ *  \param [in] comm    The MPI communicator.
+ *  \return             For the root process this is a pointer to \a conn_in.
+ *                      Else, a pointer to a newly allocated connectivity
+ *                      structure with the same values as \a conn_in on the
+ *                      root process.
+ */
+p4est_connectivity_t *p4est_connectivity_bcast (p4est_connectivity_t *
+                                                conn_in, int root,
+                                                sc_MPI_Comm comm);
+
 /** Destroy a connectivity structure.  Also destroy all attributes.
  */
 void                p4est_connectivity_destroy (p4est_connectivity_t *
@@ -335,6 +355,16 @@ p4est_connectivity_t *p4est_connectivity_new_periodic (void);
  */
 p4est_connectivity_t *p4est_connectivity_new_rotwrap (void);
 
+/** Create a connectivity structure for two trees being rotated
+ * w.r.t. each other in a user-defined way
+ * \param[in] l_face      index of left face
+ * \param[in] r_face      index of right face
+ * \param[in] orientation orientation of trees w.r.t. each other
+ */
+p4est_connectivity_t *p4est_connectivity_new_twotrees (int l_face,
+                                                       int r_face,
+                                                       int orientation);
+
 /** Create a connectivity structure for a three-tree mesh around a corner.
  */
 p4est_connectivity_t *p4est_connectivity_new_corner (void);
@@ -389,6 +419,18 @@ p4est_connectivity_t *p4est_connectivity_new_brick (int mi, int ni,
  */
 p4est_connectivity_t *p4est_connectivity_new_byname (const char *name);
 
+/** Uniformly refine a connectivity.
+ * This is useful if you would like to uniformly refine by something other
+ * than a power of 2.
+ *
+ * \param [in] conn         a valid connectivity
+ * \param [in] num_per_edge the number of new trees in each direction
+ *
+ * \return a refined connectivity.
+ */
+p4est_connectivity_t *p4est_connectivity_refine (p4est_connectivity_t * conn,
+                                                 int num_per_edge);
+
 /** Fill an array with the axis combination of a face neighbor transform.
  * \param [in]  iface       The number of the originating face.
  * \param [in]  nface       Encoded as nface = r * 4 + nf, where nf = 0..3 is
@@ -618,7 +660,7 @@ p4est_corner_array_index (sc_array_t * array, size_t it)
  * \endcode
  *
  * This code can be called two ways.  The first, when \c vertex==NULL and \c
- * tree_to_vertex==NULL, is used to count the number of tress and vertices in
+ * tree_to_vertex==NULL, is used to count the number of trees and vertices in
  * the connectivity to be generated by the \c .inp mesh in the \a stream.  The
  * second, when \c vertices!=NULL and \c tree_to_vertex!=NULL, fill \c vertices
  * and \c tree_to_vertex.  In this case \c num_vertices and \c num_trees need
diff --git a/src/p4est_connrefine.c b/src/p4est_connrefine.c
new file mode 100644
index 0000000..a8620a7
--- /dev/null
+++ b/src/p4est_connrefine.c
@@ -0,0 +1,179 @@
+/*
+  This file is part of p4est.
+  p4est is a C library to manage a collection (a forest) of multiple
+  connected adaptive quadtrees or octrees in parallel.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
+
+  p4est is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  p4est is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with p4est; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+#ifndef P4_TO_P8
+#include <p4est_lnodes.h>
+#include <p4est_bits.h>
+#else
+#include <p8est_lnodes.h>
+#include <p8est_bits.h>
+#endif
+
+static void
+trilinear_interp (double (*v)[3], double eta[3], double xyz[3])
+{
+  int                 i;
+
+  for (i = 0; i < 3; i++) {
+    xyz[i] = (1. - eta[2]) * ((1. - eta[1]) * ((1. - eta[0]) * v[0][i]
+                                               + eta[0] * v[1][i]
+                              )
+                              + eta[1] * ((1. - eta[0]) * v[2][i]
+                                          + eta[0] * v[3][i]
+                              )
+      )
+#ifdef P4_TO_P8
+      + eta[2] * ((1. - eta[1]) * ((1. - eta[0]) * v[4][i]
+                                   + eta[0] * v[5][i]
+                  )
+                  + eta[1] * ((1. - eta[0]) * v[6][i]
+                              + eta[0] * v[7][i]
+                  )
+      )
+#endif
+      ;
+  }
+}
+
+p4est_connectivity_t *
+p4est_connectivity_refine (p4est_connectivity_t * conn_in, int num_per_edge)
+{
+  p4est_t            *dummy_forest;
+  p4est_ghost_t      *dummy_ghost;
+  p4est_lnodes_t     *dummy_lnodes;
+  p4est_connectivity_t *conn_out;
+  p4est_topidx_t      num_old_trees = conn_in->num_trees;
+  int                 ceillog = SC_LOG2_32 (num_per_edge - 1) + 1;
+  int                 ceil = (1 << ceillog);
+#ifndef P4_TO_P8
+  int                 N = num_per_edge * num_per_edge;
+  int                 M = ceil * ceil;
+#else
+  int                 N = num_per_edge * num_per_edge * num_per_edge;
+  int                 M = ceil * ceil * ceil;
+#endif
+  p4est_topidx_t      num_new_trees = num_old_trees * N;
+  p4est_topidx_t      num_new_vertices, ti, count;
+  int                 j;
+
+  P4EST_ASSERT (num_per_edge >= 1);
+
+  /* each processor redundantly creates the new connectivity */
+  dummy_forest = p4est_new (sc_MPI_COMM_SELF, conn_in, 0, 0, NULL);
+  dummy_ghost = p4est_ghost_new (dummy_forest, P4EST_CONNECT_FULL);
+  dummy_lnodes = p4est_lnodes_new (dummy_forest, dummy_ghost, num_per_edge);
+
+  num_new_vertices = (p4est_topidx_t) dummy_lnodes->num_local_nodes;
+
+  conn_out = p4est_connectivity_new (num_new_vertices, num_new_trees,
+#ifdef P4_TO_P8
+                                     0, 0,
+#endif
+                                     0, 0);
+
+  for (ti = 0; ti < num_new_trees; ti++) {
+    for (j = 0; j < P4EST_FACES; j++) {
+      conn_out->tree_to_tree[P4EST_FACES * ti + j] = ti;
+      conn_out->tree_to_face[P4EST_FACES * ti + j] = j;
+    }
+  }
+  for (count = 0, ti = 0; ti < num_old_trees; ti++) {
+    double              v[P4EST_CHILDREN][3];
+
+    for (j = 0; j < P4EST_CHILDREN; j++) {
+      int                 k;
+
+      for (k = 0; k < 3; k++) {
+        v[j][k] =
+          conn_in->vertices[3 *
+                            conn_in->tree_to_vertex[P4EST_CHILDREN * ti + j] +
+                            k];
+      }
+    }
+    for (j = 0; j < M; j++) {
+      p4est_quadrant_t    dummy;
+      uint64_t            R = j;
+      int                 x[P4EST_DIM], k;
+      int                 id, pow;
+      double              xyz[3];
+      p4est_topidx_t      thisvert;
+
+      p4est_quadrant_set_morton (&dummy, ceillog, R);
+
+      x[0] = (dummy.x >> (P4EST_MAXLEVEL - ceillog));
+      x[1] = (dummy.y >> (P4EST_MAXLEVEL - ceillog));
+#ifdef P4_TO_P8
+      x[2] = (dummy.z >> (P4EST_MAXLEVEL - ceillog));
+#endif
+      for (k = 0; k < P4EST_DIM; k++) {
+        if (x[k] >= num_per_edge) {
+          break;
+        }
+      }
+      if (k < P4EST_DIM) {
+        continue;
+      }
+
+      id = 0;
+      pow = 1;
+      for (k = 0; k < P4EST_DIM; k++) {
+        id += x[k] * pow;
+        pow *= (num_per_edge + 1);
+      }
+
+      for (k = 0; k < P4EST_CHILDREN; k++) {
+        int                 thisid = id, l;
+        double              eta[3] = { 0. };
+
+        pow = 1;
+        for (l = 0; l < P4EST_DIM; l++) {
+          int                 thisx = x[l];
+          int                 thisincr = (! !(k & 1 << l));
+
+          thisid += pow * thisincr;
+          pow *= (num_per_edge + 1);
+          eta[l] = ((double) (thisx + thisincr)) / ((double) num_per_edge);
+        }
+        P4EST_ASSERT (thisid < dummy_lnodes->vnodes);
+        trilinear_interp (v, eta, xyz);
+        conn_out->tree_to_vertex[P4EST_CHILDREN * count + k] = thisvert =
+          dummy_lnodes->element_nodes[dummy_lnodes->vnodes * ti + thisid];
+        for (l = 0; l < 3; l++) {
+          conn_out->vertices[3 * thisvert + l] = xyz[l];
+        }
+      }
+
+      count++;
+    }
+  }
+  P4EST_ASSERT (count == num_new_trees);
+
+  p4est_lnodes_destroy (dummy_lnodes);
+  p4est_ghost_destroy (dummy_ghost);
+  p4est_destroy (dummy_forest);
+
+  p4est_connectivity_complete (conn_out);
+
+  return conn_out;
+}
diff --git a/src/p4est_extended.h b/src/p4est_extended.h
index 4732c32..94b9253 100644
--- a/src/p4est_extended.h
+++ b/src/p4est_extended.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -42,6 +43,7 @@
 #include <p4est.h>
 #include <p4est_mesh.h>
 #include <p4est_iterate.h>
+#include <p4est_lnodes.h>
 
 SC_EXTERN_C_BEGIN;
 
@@ -146,6 +148,23 @@ p4est_mesh_t       *p4est_mesh_new_ext (p4est_t * p4est,
                                         int compute_level_lists,
                                         p4est_connect_type_t btype);
 
+/** Make a deep copy of a p4est.
+ * The connectivity is not duplicated.
+ * Copying of quadrant user data is optional.
+ * If old and new data sizes are 0, the user_data field is copied regardless.
+ * The inspect member of the copy is set to NULL.
+ * The revision counter of the copy is set to zero.
+ *
+ * \param [in]  copy_data  If true, data are copied.
+ *                         If false, data_size is set to 0.
+ * \param [in]  duplicate_mpicomm  If true, MPI communicator is copied.
+ * \return  Returns a valid p4est that does not depend on the input,
+ *                         except for borrowing the same connectivity.
+ *                         Its revision counter is 0.
+ */
+p4est_t            *p4est_copy_ext (p4est_t * input, int copy_data,
+                                    int duplicate_mpicomm);
+
 /** Refine a forest with a bounded refinement level and a replace option.
  * \param [in,out] p4est The forest is changed in place.
  * \param [in] refine_recursive Boolean to decide on recursive refinement.
@@ -235,6 +254,16 @@ p4est_gloidx_t      p4est_partition_ext (p4est_t * p4est,
                                          int partition_for_coarsening,
                                          p4est_weight_t weight_fn);
 
+/** Correct partition to allow one level of coarsening.
+ *
+ * \param [in] p4est                     forest whose partition is corrected
+ * \param [in,out] num_quadrants_in_proc partition that will be corrected
+ * \return                               absolute number of moved quadrants
+ */
+p4est_gloidx_t      p4est_partition_for_coarsening (p4est_t * p4est,
+                                                    p4est_locidx_t *
+                                                    num_quadrants_in_proc);
+
 /** p4est_iterate_ext adds the option \a remote: if this is false, then it is
  * the same as p4est_iterate; if this is true, then corner callbacks are also
  * called on corners for hanging faces touched by local quadrants.
@@ -303,6 +332,68 @@ p4est_t            *p4est_source_ext (sc_io_source_t * src,
                                       int broadcasthead, void *user_pointer,
                                       p4est_connectivity_t ** connectivity);
 
+/** Create the data necessary to create a PETSc DMPLEX representation of a
+ * forest, as well as the accompanying lnodes and ghost layer.  The forest
+ * must be at least face balanced (see p4est_balance()).  See
+ * test/test_plex2.c for example usage.
+ *
+ * All arrays should be initialized to hold sizeof (p4est_locidx_t), except
+ * for \a out_remotes, which should be initialized to hold
+ * (2 * sizeof (p4est_locidx_t)).
+ *
+ * \param[in]     p4est                 the forest
+ * \param[out]    ghost                 the ghost layer
+ * \param[out]    lnodes                the lnodes
+ * \param[in]     ctype                 the type of adjacency for the overlap
+ * \param[in]     overlap               the number of layers of overlap (zero
+ *                                      is acceptable)
+ * \param[out]    first_local_quad      the local quadrants are assigned
+ *                                      contiguous plex indices, starting with
+ *                                      this index
+ * \param[in,out] out_points_per_dim    filled with argument for
+ *                                      DMPlexCreateFromDAG()
+ * \param[in,out] out_cone_sizes        filled with argument for
+ *                                      DMPlexCreateFromDAG()
+ * \param[in,out] out_cones             filled with argument for
+ *                                      DMPlexCreateFromDAG()
+ * \param[in,out] out_cone_orientations filled with argument for
+ *                                      DMPlexCreateFromDAG()
+ * \param[in,out] out_vertex_coords     filled with argument for
+ *                                      DMPlexCreateFromDAG()
+ * \param[in,out] out_children          filled with argument for
+ *                                      DMPlexSetTree()
+ * \param[in,out] out_parents           filled with argument for
+ *                                      DMPlexSetTree()
+ * \param[in,out] out_childids          filled with argument for
+ *                                      DMPlexSetTree()
+ * \param[in,out] out_leaves            filled with argument for
+ *                                      PetscSFSetGraph()
+ * \param[in,out] out_remotes           filled with argument for
+ *                                      PetscSFSetGraph()
+ * \param[in]     custom_numbering      Whether to use the default numbering
+ *                                      (0) of DMPlex child ids or the custom
+ *                                      (1).
+ */
+void                p4est_get_plex_data_ext (p4est_t * p4est,
+                                             p4est_ghost_t ** ghost,
+                                             p4est_lnodes_t ** lnodes,
+                                             p4est_connect_type_t ctype,
+                                             int overlap,
+                                             p4est_locidx_t *
+                                             first_local_quad,
+                                             sc_array_t * out_points_per_dim,
+                                             sc_array_t * out_cone_sizes,
+                                             sc_array_t * out_cones,
+                                             sc_array_t *
+                                             out_cone_orientations,
+                                             sc_array_t * out_vertex_coords,
+                                             sc_array_t * out_children,
+                                             sc_array_t * out_parents,
+                                             sc_array_t * out_childids,
+                                             sc_array_t * out_leaves,
+                                             sc_array_t * out_remotes,
+                                             int custom_numbering);
+
 SC_EXTERN_C_END;
 
 #endif /* !P4EST_EXTENDED_H */
diff --git a/src/p4est_geometry.c b/src/p4est_geometry.c
index 3b38482..e5317cc 100644
--- a/src/p4est_geometry.c
+++ b/src/p4est_geometry.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p4est_geometry.h b/src/p4est_geometry.h
index 51ac1bd..f1df947 100644
--- a/src/p4est_geometry.h
+++ b/src/p4est_geometry.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -21,7 +22,7 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
 
-/** \file p4est_geometry.h transforms from vertex frame to physical space
+/** \file p4est_geometry.h transforms from vertex frame to physical space.
  *
  * \ingroup p4est
  */
@@ -33,6 +34,7 @@
 
 SC_EXTERN_C_BEGIN;
 
+/** This object encapsulates a custom geometry transformation. */
 typedef struct p4est_geometry p4est_geometry_t;
 
 /** Forward transformation from the reference unit square to physical space.
diff --git a/src/p4est_ghost.c b/src/p4est_ghost.c
index 6d948e5..6311482 100644
--- a/src/p4est_ghost.c
+++ b/src/p4est_ghost.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -2297,6 +2298,14 @@ void
 p4est_ghost_exchange_data (p4est_t * p4est, p4est_ghost_t * ghost,
                            void *ghost_data)
 {
+  p4est_ghost_exchange_data_end (p4est_ghost_exchange_data_begin
+                                 (p4est, ghost, ghost_data));
+}
+
+p4est_ghost_exchange_t *
+p4est_ghost_exchange_data_begin (p4est_t * p4est, p4est_ghost_t * ghost,
+                                 void *ghost_data)
+{
   size_t              zz;
   size_t              data_size;
 #ifdef P4EST_ENABLE_DEBUG
@@ -2306,8 +2315,10 @@ p4est_ghost_exchange_data (p4est_t * p4est, p4est_ghost_t * ghost,
   p4est_locidx_t      which_quad;
   p4est_quadrant_t   *mirror, *q;
   p4est_tree_t       *tree;
+  p4est_ghost_exchange_t *exc;
   void              **mirror_data;
 
+  /* allocate temporary storage */
   mirror_data = P4EST_ALLOC (void *, ghost->mirrors.elem_count);
 
   data_size = p4est->data_size == 0 ? sizeof (void *) : p4est->data_size;
@@ -2332,9 +2343,30 @@ p4est_ghost_exchange_data (p4est_t * p4est, p4est_ghost_t * ghost,
       p4est->data_size == 0 ? &q->p.user_data : q->p.user_data;
   }
 
-  p4est_ghost_exchange_custom (p4est, ghost, data_size,
-                               mirror_data, ghost_data);
+  /* delegate the rest of the work */
+  exc = p4est_ghost_exchange_custom_begin (p4est, ghost, data_size,
+                                           mirror_data, ghost_data);
+  P4EST_ASSERT (exc->is_custom);
+  P4EST_ASSERT (!exc->is_levels);
+  exc->is_custom = 0;
+
+  /* the mirror_data is copied before sending so it can be freed */
   P4EST_FREE (mirror_data);
+
+  /* return message buffers */
+  return exc;
+}
+
+void
+p4est_ghost_exchange_data_end (p4est_ghost_exchange_t * exc)
+{
+  /* don't confuse this function with p4est_ghost_exchange_custom_end */
+  P4EST_ASSERT (!exc->is_custom);
+  P4EST_ASSERT (!exc->is_levels);
+
+  /* delegate the rest of the work, including freeing the context */
+  exc->is_custom = 1;
+  p4est_ghost_exchange_custom_end (exc);
 }
 
 void
@@ -2342,21 +2374,41 @@ p4est_ghost_exchange_custom (p4est_t * p4est, p4est_ghost_t * ghost,
                              size_t data_size,
                              void **mirror_data, void *ghost_data)
 {
+  p4est_ghost_exchange_custom_end (p4est_ghost_exchange_custom_begin
+                                   (p4est, ghost, data_size,
+                                    mirror_data, ghost_data));
+}
+
+p4est_ghost_exchange_t *
+p4est_ghost_exchange_custom_begin (p4est_t * p4est, p4est_ghost_t * ghost,
+                                   size_t data_size,
+                                   void **mirror_data, void *ghost_data)
+{
   const int           num_procs = p4est->mpisize;
   int                 mpiret;
   int                 q;
   char               *mem, **sbuf;
-  size_t              zz;
-  sc_array_t          requests, sbuffers;
   p4est_locidx_t      ng_excl, ng_incl, ng, theg;
   p4est_locidx_t      mirr;
+  p4est_ghost_exchange_t *exc;
   sc_MPI_Request     *r;
 
+  /* initialize transient storage */
+  exc = P4EST_ALLOC_ZERO (p4est_ghost_exchange_t, 1);
+  exc->is_custom = 1;
+  exc->p4est = p4est;
+  exc->ghost = ghost;
+  exc->minlevel = 0;
+  exc->maxlevel = P4EST_QMAXLEVEL;
+  exc->data_size = data_size;
+  exc->ghost_data = ghost_data;
+  sc_array_init (&exc->requests, sizeof (sc_MPI_Request));
+  sc_array_init (&exc->sbuffers, sizeof (char *));
+
+  /* return early if there is nothing to do */
   if (data_size == 0) {
-    return;
+    return exc;
   }
-  sc_array_init (&requests, sizeof (sc_MPI_Request));
-  sc_array_init (&sbuffers, sizeof (char *));
 
   /* receive data from other processors */
   ng_excl = 0;
@@ -2365,7 +2417,7 @@ p4est_ghost_exchange_custom (p4est_t * p4est, p4est_ghost_t * ghost,
     ng = ng_incl - ng_excl;
     P4EST_ASSERT (ng >= 0);
     if (ng > 0) {
-      r = (sc_MPI_Request *) sc_array_push (&requests);
+      r = (sc_MPI_Request *) sc_array_push (&exc->requests);
       mpiret = sc_MPI_Irecv ((char *) ghost_data + ng_excl * data_size,
                              ng * data_size, sc_MPI_BYTE, q,
                              P4EST_COMM_GHOST_EXCHANGE, p4est->mpicomm, r);
@@ -2383,7 +2435,7 @@ p4est_ghost_exchange_custom (p4est_t * p4est, p4est_ghost_t * ghost,
     P4EST_ASSERT (ng >= 0);
     if (ng > 0) {
       /* every peer populates its own send buffer */
-      sbuf = (char **) sc_array_push (&sbuffers);
+      sbuf = (char **) sc_array_push (&exc->sbuffers);
       mem = *sbuf = P4EST_ALLOC (char, ng * data_size);
       for (theg = 0; theg < ng; ++theg) {
         mirr = ghost->mirror_proc_mirrors[ng_excl + theg];
@@ -2391,7 +2443,7 @@ p4est_ghost_exchange_custom (p4est_t * p4est, p4est_ghost_t * ghost,
         memcpy (mem, mirror_data[mirr], data_size);
         mem += data_size;
       }
-      r = (sc_MPI_Request *) sc_array_push (&requests);
+      r = (sc_MPI_Request *) sc_array_push (&exc->requests);
       mpiret = sc_MPI_Isend (*sbuf, ng * data_size, sc_MPI_BYTE, q,
                              P4EST_COMM_GHOST_EXCHANGE, p4est->mpicomm, r);
       SC_CHECK_MPI (mpiret);
@@ -2399,16 +2451,36 @@ p4est_ghost_exchange_custom (p4est_t * p4est, p4est_ghost_t * ghost,
     }
   }
 
-  /* wait and clean up */
-  mpiret = sc_MPI_Waitall (requests.elem_count, (sc_MPI_Request *)
-                           requests.array, sc_MPI_STATUSES_IGNORE);
+  /* we are done posting the messages */
+  return exc;
+}
+
+void
+p4est_ghost_exchange_custom_end (p4est_ghost_exchange_t * exc)
+{
+  int                 mpiret;
+  size_t              zz;
+  char              **sbuf;
+
+  /* don't confuse this function with p4est_ghost_exchange_data_end */
+  P4EST_ASSERT (exc->is_custom);
+
+  /* don't confuse it with p4est_ghost_exchange_custom_levels_end either */
+  P4EST_ASSERT (!exc->is_levels);
+
+  /* wait for messages to complete and clean up */
+  mpiret = sc_MPI_Waitall (exc->requests.elem_count, (sc_MPI_Request *)
+                           exc->requests.array, sc_MPI_STATUSES_IGNORE);
   SC_CHECK_MPI (mpiret);
-  sc_array_reset (&requests);
-  for (zz = 0; zz < sbuffers.elem_count; ++zz) {
-    sbuf = (char **) sc_array_index (&sbuffers, zz);
+  sc_array_reset (&exc->requests);
+  for (zz = 0; zz < exc->sbuffers.elem_count; ++zz) {
+    sbuf = (char **) sc_array_index (&exc->sbuffers, zz);
     P4EST_FREE (*sbuf);
   }
-  sc_array_reset (&sbuffers);
+  sc_array_reset (&exc->sbuffers);
+
+  /* free the store */
+  P4EST_FREE (exc);
 }
 
 void
@@ -2417,37 +2489,65 @@ p4est_ghost_exchange_custom_levels (p4est_t * p4est, p4est_ghost_t * ghost,
                                     size_t data_size,
                                     void **mirror_data, void *ghost_data)
 {
+  p4est_ghost_exchange_custom_levels_end
+    (p4est_ghost_exchange_custom_levels_begin (p4est, ghost,
+                                               minlevel, maxlevel, data_size,
+                                               mirror_data, ghost_data));
+}
+
+p4est_ghost_exchange_t *
+p4est_ghost_exchange_custom_levels_begin (p4est_t * p4est,
+                                          p4est_ghost_t * ghost,
+                                          int minlevel, int maxlevel,
+                                          size_t data_size,
+                                          void **mirror_data,
+                                          void *ghost_data)
+{
   const int           num_procs = p4est->mpisize;
   int                 mpiret;
   int                 q;
-  int                 i, expected, remaining, received, *peers;
   int                *theq, *qactive, *qbuffer;
   char               *mem, **rbuf, **sbuf;
-  size_t              zz;
-  sc_array_t          rrequests, srequests, rbuffers, sbuffers;
   p4est_locidx_t      ng_excl, ng_incl, ng, theg;
   p4est_locidx_t      lmatches;
   p4est_locidx_t      mirr;
   p4est_quadrant_t   *g, *m;
+  p4est_ghost_exchange_t *exc;
   sc_MPI_Request     *r;
 
   if (minlevel <= 0 && maxlevel >= P4EST_QMAXLEVEL) {
-    /* this saves a copy for the ghost quadrants */
-    p4est_ghost_exchange_custom (p4est, ghost,
-                                 data_size, mirror_data, ghost_data);
-    return;
-  }
+    /* this case can be processed by a more specialized function */
+    exc = p4est_ghost_exchange_custom_begin (p4est, ghost, data_size,
+                                             mirror_data, ghost_data);
+    P4EST_ASSERT (exc->is_custom);
+    P4EST_ASSERT (!exc->is_levels);
+    exc->is_levels = 1;
+
+    /* the completion function will have to switch for this case */
+    return exc;
+  }
+
+  /* initialize transient storage */
+  exc = P4EST_ALLOC_ZERO (p4est_ghost_exchange_t, 1);
+  exc->is_custom = 1;
+  exc->is_levels = 1;
+  exc->p4est = p4est;
+  exc->ghost = ghost;
+  exc->minlevel = minlevel;
+  exc->maxlevel = maxlevel;
+  exc->data_size = data_size;
+  exc->ghost_data = ghost_data;
+  sc_array_init (&exc->requests, sizeof (sc_MPI_Request));
+  sc_array_init (&exc->rrequests, sizeof (sc_MPI_Request));
+  sc_array_init (&exc->rbuffers, sizeof (char *));
+  sc_array_init (&exc->sbuffers, sizeof (char *));
+
+  /* return early if there is nothing to do */
   if (data_size == 0 || minlevel > maxlevel) {
-    /* nothing to do */
-    return;
+    return exc;
   }
-
-  sc_array_init (&rrequests, sizeof (sc_MPI_Request));
-  sc_array_init (&srequests, sizeof (sc_MPI_Request));
-  sc_array_init (&rbuffers, sizeof (char *));
-  sc_array_init (&sbuffers, sizeof (char *));
-  qactive = P4EST_ALLOC (int, num_procs);
-  qbuffer = P4EST_ALLOC (int, num_procs);
+  qactive = exc->qactive = P4EST_ALLOC (int, num_procs);
+  qbuffer = exc->qbuffer = P4EST_ALLOC (int, num_procs);
 
   /* receive data from other processors */
   ng_excl = 0;
@@ -2467,13 +2567,13 @@ p4est_ghost_exchange_custom_levels (p4est_t * p4est, p4est_ghost_t * ghost,
         }
       }
       if (lmatches > 0) {
-        theq = qactive + rrequests.elem_count;
-        r = (sc_MPI_Request *) sc_array_push (&rrequests);
+        theq = qactive + exc->rrequests.elem_count;
+        r = (sc_MPI_Request *) sc_array_push (&exc->rrequests);
         if (lmatches < ng) {
           /* every peer populates its own receive buffer */
           *theq = q;
-          qbuffer[q] = (int) rbuffers.elem_count;
-          rbuf = (char **) sc_array_push (&rbuffers);
+          qbuffer[q] = (int) exc->rbuffers.elem_count;
+          rbuf = (char **) sc_array_push (&exc->rbuffers);
           *rbuf = P4EST_ALLOC (char, lmatches * data_size);
           mpiret = sc_MPI_Irecv (*rbuf, lmatches * data_size, sc_MPI_BYTE, q,
                                  P4EST_COMM_GHOST_EXCHANGE, p4est->mpicomm,
@@ -2512,7 +2612,7 @@ p4est_ghost_exchange_custom_levels (p4est_t * p4est, p4est_ghost_t * ghost,
       }
       if (lmatches > 0) {
         /* every peer populates its own send buffer */
-        sbuf = (char **) sc_array_push (&sbuffers);
+        sbuf = (char **) sc_array_push (&exc->sbuffers);
         mem = *sbuf = P4EST_ALLOC (char, lmatches * data_size);
         for (theg = 0; theg < ng; ++theg) {
           mirr = ghost->mirror_proc_mirrors[ng_excl + theg];
@@ -2522,7 +2622,7 @@ p4est_ghost_exchange_custom_levels (p4est_t * p4est, p4est_ghost_t * ghost,
             mem += data_size;
           }
         }
-        r = (sc_MPI_Request *) sc_array_push (&srequests);
+        r = (sc_MPI_Request *) sc_array_push (&exc->requests);
         mpiret = sc_MPI_Isend (*sbuf, lmatches * data_size, sc_MPI_BYTE, q,
                                P4EST_COMM_GHOST_EXCHANGE, p4est->mpicomm, r);
         SC_CHECK_MPI (mpiret);
@@ -2531,18 +2631,55 @@ p4est_ghost_exchange_custom_levels (p4est_t * p4est, p4est_ghost_t * ghost,
     }
   }
 
+  /* we are done posting messages */
+  return exc;
+}
+
+void
+p4est_ghost_exchange_custom_levels_end (p4est_ghost_exchange_t * exc)
+{
+  p4est_ghost_t      *ghost = exc->ghost;
+#ifdef P4EST_ENABLE_DEBUG
+  p4est_t            *p4est = exc->p4est;
+  const int           num_procs = p4est->mpisize;
+#endif
+  const int           minlevel = exc->minlevel;
+  const int           maxlevel = exc->maxlevel;
+  const size_t        data_size = exc->data_size;
+  int                 mpiret;
+  int                 i, expected, remaining, received, *peers;
+  int                 q;
+  char              **rbuf, **sbuf;
+  size_t              zz;
+  p4est_locidx_t      ng_excl, ng_incl, ng, theg;
+  p4est_locidx_t      lmatches;
+  p4est_quadrant_t   *g;
+
+  /* make sure that the begin function matches the end function */
+  P4EST_ASSERT (exc->is_custom);
+  P4EST_ASSERT (exc->is_levels);
+
+  /* check whether we have used the specialized function */
+  if (minlevel <= 0 && maxlevel >= P4EST_QMAXLEVEL) {
+    exc->is_levels = 0;
+    p4est_ghost_exchange_custom_end (exc);
+    return;
+  }
+
   /* wait for receives and copy data into the proper result array */
-  peers = P4EST_ALLOC (int, rrequests.elem_count);
-  expected = remaining = (int) rrequests.elem_count;
+  peers = P4EST_ALLOC (int, exc->rrequests.elem_count);
+  expected = remaining = (int) exc->rrequests.elem_count;
   while (remaining > 0) {
-    mpiret = sc_MPI_Waitsome (expected, (sc_MPI_Request *) rrequests.array,
-                              &received, peers, sc_MPI_STATUSES_IGNORE);
+    mpiret =
+      sc_MPI_Waitsome (expected, (sc_MPI_Request *) exc->rrequests.array,
+                       &received, peers, sc_MPI_STATUSES_IGNORE);
     SC_CHECK_MPI (mpiret);
     P4EST_ASSERT (received != sc_MPI_UNDEFINED);
     P4EST_ASSERT (received > 0);
     for (i = 0; i < received; ++i) {
-      P4EST_ASSERT (0 <= peers[i] && peers[i] < (int) rrequests.elem_count);
-      q = qactive[peers[i]];
+      P4EST_ASSERT (0 <= peers[i] &&
+                    peers[i] < (int) exc->rrequests.elem_count);
+      q = exc->qactive[peers[i]];
       if (q >= 0) {
         P4EST_ASSERT (q != p4est->mpirank && q < num_procs);
         ng_excl = ghost->proc_offsets[q];
@@ -2550,38 +2687,41 @@ p4est_ghost_exchange_custom_levels (p4est_t * p4est, p4est_ghost_t * ghost,
         ng = ng_incl - ng_excl;
         P4EST_ASSERT (ng > 0);
         /* run through ghosts to copy the matching level quadrants' data */
-        rbuf = (char **) sc_array_index_int (&rbuffers, qbuffer[q]);
+        rbuf = (char **) sc_array_index_int (&exc->rbuffers, exc->qbuffer[q]);
         for (lmatches = 0, theg = 0; theg < ng; ++theg) {
           g = p4est_quadrant_array_index (&ghost->ghosts, ng_excl + theg);
           if (minlevel <= (int) g->level && (int) g->level <= maxlevel) {
-            memcpy ((char *) ghost_data + (ng_excl + theg) * data_size,
+            memcpy ((char *) exc->ghost_data + (ng_excl + theg) * data_size,
                     *rbuf + lmatches * data_size, data_size);
             ++lmatches;
           }
         }
         P4EST_FREE (*rbuf);
-        qactive[peers[i]] = -1;
-        qbuffer[q] = -1;
+        exc->qactive[peers[i]] = -1;
+        exc->qbuffer[q] = -1;
       }
     }
     remaining -= received;
   }
   P4EST_FREE (peers);
-  P4EST_FREE (qactive);
-  P4EST_FREE (qbuffer);
-  sc_array_reset (&rrequests);
-  sc_array_reset (&rbuffers);
+  P4EST_FREE (exc->qactive);
+  P4EST_FREE (exc->qbuffer);
+  sc_array_reset (&exc->rrequests);
+  sc_array_reset (&exc->rbuffers);
 
   /* wait for sends and clean up */
-  mpiret = sc_MPI_Waitall (srequests.elem_count, (sc_MPI_Request *)
-                           srequests.array, sc_MPI_STATUSES_IGNORE);
+  mpiret = sc_MPI_Waitall (exc->requests.elem_count, (sc_MPI_Request *)
+                           exc->requests.array, sc_MPI_STATUSES_IGNORE);
   SC_CHECK_MPI (mpiret);
-  sc_array_reset (&srequests);
-  for (zz = 0; zz < sbuffers.elem_count; ++zz) {
-    sbuf = (char **) sc_array_index (&sbuffers, zz);
+  sc_array_reset (&exc->requests);
+  for (zz = 0; zz < exc->sbuffers.elem_count; ++zz) {
+    sbuf = (char **) sc_array_index (&exc->sbuffers, zz);
     P4EST_FREE (*sbuf);
   }
-  sc_array_reset (&sbuffers);
+  sc_array_reset (&exc->sbuffers);
+
+  /* free temporary storage */
+  P4EST_FREE (exc);
 }
 
 #ifdef P4EST_ENABLE_MPI
@@ -2739,8 +2879,7 @@ p4est_ghost_expand_kernel (p4est_topidx_t t, p4est_quadrant_t * mq,
                                           NULL, conn);
 
       for (zy = 0; zy < tempquads->elem_count; zy++) {
-        p4est_topidx_t      nnt =
-          *((p4est_topidx_t *) sc_array_index (temptrees, zy));
+        nnt = *((p4est_topidx_t *) sc_array_index (temptrees, zy));
 
         if (nnt == t) {
           p4est_quadrant_t   *tempq =
@@ -2765,8 +2904,7 @@ p4est_ghost_expand_kernel (p4est_topidx_t t, p4est_quadrant_t * mq,
                                             temptrees, NULL, conn);
 
       for (zy = 0; zy < tempquads->elem_count; zy++) {
-        p4est_topidx_t      nnt =
-          *((p4est_topidx_t *) sc_array_index (temptrees, zy));
+        nnt = *((p4est_topidx_t *) sc_array_index (temptrees, zy));
 
         if (nnt == t) {
           p4est_quadrant_t   *tempq =
@@ -2921,7 +3059,7 @@ p4est_ghost_expand_internal (p4est_t * p4est, p4est_lnodes_t * lnodes,
     }
 
     num_mirrors = (p4est_locidx_t) ghost->mirrors.elem_count;
-    for (il = 0; (size_t) il < num_mirrors; il++) {
+    for (il = 0; il < num_mirrors; il++) {
       p4est_quadrant_t   *q;
 
       q = p4est_quadrant_array_index (&ghost->mirrors, il);
@@ -3162,7 +3300,8 @@ p4est_ghost_expand_internal (p4est_t * p4est, p4est_lnodes_t * lnodes,
 #endif
       p4est_quadrant_t   *mq = p4est_quadrant_array_index (mirrors,
                                                            (size_t) mpf[zm]);
-      p4est_locidx_t      t = mq->p.piggy3.which_tree;
+
+      t = mq->p.piggy3.which_tree;
 
       if (lnodes) {
         /* construct adjacency via lnodes */
@@ -3631,15 +3770,16 @@ p4est_ghost_expand_internal (p4est_t * p4est, p4est_lnodes_t * lnodes,
   for (p = 0; p < mpisize; p++) {
     /* add all of the potentially new mirrors */
     buf = (sc_array_t *) sc_array_index_int (send_bufs, p);
-    size_t              oldsize;
 
     if (!buf->elem_count) {
       continue;
     }
-    oldsize = new_mirrors->elem_count;
-    sc_array_resize (new_mirrors, oldsize + buf->elem_count);
-    memcpy (new_mirrors->array + oldsize * new_mirrors->elem_size,
-            buf->array, buf->elem_count * buf->elem_size);
+    else {
+      size_t              oldsize = new_mirrors->elem_count;
+      sc_array_resize (new_mirrors, oldsize + buf->elem_count);
+      memcpy (new_mirrors->array + oldsize * new_mirrors->elem_size,
+              buf->array, buf->elem_count * buf->elem_size);
+    }
   }
   sc_array_sort (new_mirrors, p4est_quadrant_compare_piggy);
   sc_array_uniq (new_mirrors, p4est_quadrant_compare_piggy);
@@ -3702,8 +3842,8 @@ p4est_ghost_expand_internal (p4est_t * p4est, p4est_lnodes_t * lnodes,
 
     P4EST_LDEBUGF
       ("ghost layer expanded with proc %d: send %lld receive %lld\n",
-       p, (long long) old_count + frontsize,
-       (long long) proc_offsets[p + 1] - proc_offsets[p]);
+       p, (long long) (old_count + frontsize),
+       (long long) (proc_offsets[p + 1] - proc_offsets[p]));
     sc_array_resize (nmpma, offset + old_count + frontsize);
     memcpy (nmpma->array + nmpma->elem_size * offset,
             mpf + mpfo[p], sizeof (p4est_locidx_t) * frontsize);
diff --git a/src/p4est_ghost.h b/src/p4est_ghost.h
index 23abb4d..5c74baa 100644
--- a/src/p4est_ghost.h
+++ b/src/p4est_ghost.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -89,7 +90,8 @@ p4est_ghost_t;
  * \param [in] ghost    Ghost layer structure.
  * \return true if \a ghost is valid
  */
-int                 p4est_ghost_is_valid (p4est_t *p4est, p4est_ghost_t * ghost);
+int                 p4est_ghost_is_valid (p4est_t * p4est,
+                                          p4est_ghost_t * ghost);
 
 /** Calculate the memory usage of the ghost layer.
  * \param [in] ghost    Ghost layer structure.
@@ -250,6 +252,41 @@ void                p4est_ghost_exchange_data (p4est_t * p4est,
                                                p4est_ghost_t * ghost,
                                                void *ghost_data);
 
+/** Transient storage for asynchronous ghost exchange. */
+typedef struct p4est_ghost_exchange
+{
+  int                 is_custom;        /**< False for p4est_ghost_exchange_data */
+  int                 is_levels;        /**< Are we restricted to levels or not */
+  p4est_t            *p4est;
+  p4est_ghost_t      *ghost;
+  int                 minlevel, maxlevel;       /**< Meaningful with is_levels */
+  size_t              data_size;
+  void               *ghost_data;
+  int                *qactive, *qbuffer;
+  sc_array_t          requests, sbuffers;
+  sc_array_t          rrequests, rbuffers;
+}
+p4est_ghost_exchange_t;
+
+/** Begin an asynchronous ghost data exchange by posting messages.
+ * The arguments are identical to p4est_ghost_exchange_data.
+ * The return value is always non-NULL and must be passed to
+ * p4est_ghost_exchange_data_end to complete the exchange.
+ * The ghost data must not be accessed before completion.
+ * \param [in,out]  ghost_data  Must stay alive into the completion call.
+ * \return          Transient storage for messages in progress.
+ */
+p4est_ghost_exchange_t *p4est_ghost_exchange_data_begin
+  (p4est_t * p4est, p4est_ghost_t * ghost, void *ghost_data);
+
+/** Complete an asynchronous ghost data exchange.
+ * This function waits for all pending MPI communications.
+ * \param [in,out] exc  Data created ONLY by p4est_ghost_exchange_data_begin.
+ *                      It is deallocated before this function returns.
+ */
+void                p4est_ghost_exchange_data_end
+  (p4est_ghost_exchange_t * exc);
+
 /** Transfer data for local quadrants that are ghosts to other processors.
  * The data size is the same for all quadrants and can be chosen arbitrarily.
  * \param [in] p4est            The forest used for reference.
@@ -266,6 +303,29 @@ void                p4est_ghost_exchange_custom (p4est_t * p4est,
                                                  void **mirror_data,
                                                  void *ghost_data);
 
+/** Begin an asynchronous ghost data exchange by posting messages.
+ * The arguments are identical to p4est_ghost_exchange_custom.
+ * The return value is always non-NULL and must be passed to
+ * p4est_ghost_exchange_custom_end to complete the exchange.
+ * The ghost data must not be accessed before completion.
+ * The mirror data can be safely discarded right after this function returns
+ * since it is copied into internal send buffers.
+ * \param [in]      mirror_data Not required to stay alive any longer.
+ * \param [in,out]  ghost_data  Must stay alive into the completion call.
+ * \return          Transient storage for messages in progress.
+ */
+p4est_ghost_exchange_t *p4est_ghost_exchange_custom_begin
+  (p4est_t * p4est, p4est_ghost_t * ghost,
+   size_t data_size, void **mirror_data, void *ghost_data);
+
+/** Complete an asynchronous ghost data exchange.
+ * This function waits for all pending MPI communications.
+ * \param [in,out] exc  Data created ONLY by p4est_ghost_exchange_custom_begin.
+ *                      It is deallocated before this function returns.
+ */
+void                p4est_ghost_exchange_custom_end
+  (p4est_ghost_exchange_t * exc);
+
 /** Transfer data for local quadrants that are ghosts to other processors.
  * The data size is the same for all quadrants and can be chosen arbitrarily.
  * This function restricts the transfer to a range of refinement levels.
@@ -290,6 +350,29 @@ void                p4est_ghost_exchange_custom_levels (p4est_t * p4est,
                                                         void **mirror_data,
                                                         void *ghost_data);
 
+/** Begin an asynchronous ghost data exchange by posting messages.
+ * The arguments are identical to p4est_ghost_exchange_custom_levels.
+ * The return value is always non-NULL and must be passed to
+ * p4est_ghost_exchange_custom_levels_end to complete the exchange.
+ * The ghost data must not be accessed before completion.
+ * The mirror data can be safely discarded right after this function returns
+ * since it is copied into internal send buffers.
+ * \param [in]      mirror_data Not required to stay alive any longer.
+ * \param [in,out]  ghost_data  Must stay alive into the completion call.
+ * \return          Transient storage for messages in progress.
+ */
+p4est_ghost_exchange_t *p4est_ghost_exchange_custom_levels_begin
+  (p4est_t * p4est, p4est_ghost_t * ghost, int minlevel, int maxlevel,
+   size_t data_size, void **mirror_data, void *ghost_data);
+
+/** Complete an asynchronous ghost data exchange.
+ * This function waits for all pending MPI communications.
+ * \param [in,out] exc  Data created ONLY by p4est_ghost_exchange_custom_levels_begin.
+ *                      It is deallocated before this function returns.
+ */
+void                p4est_ghost_exchange_custom_levels_end
+  (p4est_ghost_exchange_t * exc);
+
 /** Expand the size of the ghost layer and mirrors by one additional layer of
  * adjacency.
  * \param [in] p4est            The forest from which the ghost layer was
@@ -299,7 +382,6 @@ void                p4est_ghost_exchange_custom_levels (p4est_t * p4est,
 void                p4est_ghost_expand (p4est_t * p4est,
                                         p4est_ghost_t * ghost);
 
-
 SC_EXTERN_C_END;
 
 #endif /* !P4EST_GHOST_H */
diff --git a/src/p4est_io.c b/src/p4est_io.c
index e170830..39860aa 100644
--- a/src/p4est_io.c
+++ b/src/p4est_io.c
@@ -4,7 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
-  Copyright (C) 2012 Carsten Burstedde
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -92,7 +92,6 @@ p4est_inflate (sc_MPI_Comm mpicomm, p4est_connectivity_t * connectivity,
 {
   const p4est_gloidx_t *gfq;
   int                 i;
-  int                 mpiret;
   int                 num_procs, rank;
   p4est_topidx_t      num_trees, jt;
   p4est_gloidx_t      gkey, gtreeskip, gtreeremain, gquadremain;
@@ -120,24 +119,20 @@ p4est_inflate (sc_MPI_Comm mpicomm, p4est_connectivity_t * connectivity,
   /* data may be NULL, in this case p4est->data_size will be 0 */
   /* user_pointer may be anything, we don't look at it */
 
-  /* retrieve MPI information */
-  mpiret = sc_MPI_Comm_size (mpicomm, &num_procs);
-  SC_CHECK_MPI (mpiret);
-  mpiret = sc_MPI_Comm_rank (mpicomm, &rank);
-  SC_CHECK_MPI (mpiret);
-
-  /* assign some data members */
+  /* create p4est object and assign some data members */
   p4est = P4EST_ALLOC_ZERO (p4est_t, 1);
-  p4est->mpicomm = mpicomm;
-  p4est->mpisize = num_procs;
-  p4est->mpirank = rank;
-  dsize = p4est->data_size = data == NULL ? 0 : data->elem_size;
+  dsize = p4est->data_size = (data == NULL ? 0 : data->elem_size);
   dap = (char *) (data == NULL ? NULL : data->array);
   qap = (p4est_locidx_t *) quadrants->array;
   p4est->user_pointer = user_pointer;
   p4est->connectivity = connectivity;
   num_trees = connectivity->num_trees;
 
+  /* set parallel environment */
+  p4est_comm_parallel_env_assign (p4est, mpicomm);
+  num_procs = p4est->mpisize;
+  rank = p4est->mpirank;
+
   /* create global first quadrant offsets */
   gfq = p4est->global_first_quadrant =
     P4EST_ALLOC (p4est_gloidx_t, num_procs + 1);
@@ -261,6 +256,7 @@ p4est_inflate (sc_MPI_Comm mpicomm, p4est_connectivity_t * connectivity,
   P4EST_VERBOSEF ("total local quadrants %lld\n",
                   (long long) p4est->local_num_quadrants);
 
+  P4EST_ASSERT (p4est->revision == 0);
   P4EST_ASSERT (p4est_is_valid (p4est));
   p4est_log_indent_pop ();
   P4EST_GLOBAL_PRODUCTION ("Done " P4EST_STRING "_inflate\n");
diff --git a/src/p4est_io.h b/src/p4est_io.h
index dfc360d..4f4cecb 100644
--- a/src/p4est_io.h
+++ b/src/p4est_io.h
@@ -4,7 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
-  Copyright (C) 2012 Carsten Burstedde
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -41,6 +41,7 @@ sc_array_t         *p4est_deflate_quadrants (p4est_t * p4est,
                                              sc_array_t ** data);
 
 /** Create a new p4est based on serialized data.
+ * Its revision counter is set to zero.
  * See p4est.h and p4est_communication.h for more information on parameters.
  * \param [in] mpicomm       A valid MPI communicator.
  * \param [in] connectivity  This is the connectivity information that
@@ -55,7 +56,7 @@ sc_array_t         *p4est_deflate_quadrants (p4est_t * p4est,
  *                           The elem_size of this array informs data_size.
  *                           Its elem_count equals the number of local quads.
  * \param [in] user_pointer  Assign to the user_pointer member of the p4est.
- * \return              The newly created p4est.
+ * \return              The newly created p4est with a zero revision counter.
  */
 p4est_t            *p4est_inflate (sc_MPI_Comm mpicomm,
                                    p4est_connectivity_t * connectivity,
diff --git a/src/p4est_iterate.c b/src/p4est_iterate.c
index f26e48c..1294bea 100644
--- a/src/p4est_iterate.c
+++ b/src/p4est_iterate.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p4est_iterate.h b/src/p4est_iterate.h
index 4d43a50..5a56224 100644
--- a/src/p4est_iterate.h
+++ b/src/p4est_iterate.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -246,7 +247,7 @@ p4est_iter_cside_array_index_int (sc_array_t * array, int it)
   P4EST_ASSERT (it >= 0 && (size_t) it < array->elem_count);
 
   return (p4est_iter_corner_side_t *)
-    (array->array + sizeof (p4est_iter_corner_side_t) * it);
+    (array->array + sizeof (p4est_iter_corner_side_t) * (size_t) it);
 }
 
 /** Return a pointer to a iter_corner_side array element indexed by a size_t.
@@ -272,7 +273,7 @@ p4est_iter_fside_array_index_int (sc_array_t * array, int it)
   P4EST_ASSERT (it >= 0 && (size_t) it < array->elem_count);
 
   return (p4est_iter_face_side_t *)
-    (array->array + sizeof (p4est_iter_face_side_t) * it);
+    (array->array + sizeof (p4est_iter_face_side_t) * (size_t) it);
 }
 
 /** Return a pointer to a iter_face_side array element indexed by a size_t.
diff --git a/src/p4est_lnodes.c b/src/p4est_lnodes.c
index ffe79d2..3b865ce 100644
--- a/src/p4est_lnodes.c
+++ b/src/p4est_lnodes.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -21,6 +22,7 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
 
+#include <sc_statistics.h>
 #ifndef P4_TO_P8
 #include <p4est_bits.h>
 #include <p4est_communication.h>
@@ -35,6 +37,10 @@
 #include <p8est_lnodes.h>
 #endif
 
+#ifdef P4EST_ENABLE_DEBUG
+#include <sc_statistics.h>
+#endif
+
 #ifndef P4_TO_P8
 #define P4EST_LN_C_OFFSET 4
 #else
@@ -1236,7 +1242,6 @@ p8est_lnodes_edge_callback (p8est_iter_edge_info_t * info, void *Data)
         z = q->z + h * ((c & 4) >> 2);
       }
       else {
-        int                 c2;
         p4est_topidx_t      nnt;
 
         P4EST_ASSERT (nt == tid);
@@ -2036,7 +2041,7 @@ p4est_lnodes_count_send (p4est_lnodes_data_t * data, p4est_t * p4est,
       total_sent += (send_count * sizeof (p4est_locidx_t));
     }
   }
-  P4EST_VERBOSEF ("Total of %lld bytes sent to %d processes\n",
+  P4EST_VERBOSEF ("Total of %llu bytes sent to %d processes\n",
                   (unsigned long long) total_sent, num_send_procs);
 }
 
@@ -2287,7 +2292,7 @@ p4est_lnodes_recv (p4est_t * p4est, p4est_lnodes_data_t * data,
     sc_array_reset (&(recv_buf[i]));
   }
 
-  P4EST_VERBOSEF ("Total of %lld bytes received from %d processes\n",
+  P4EST_VERBOSEF ("Total of %llu bytes received from %d processes\n",
                   (unsigned long long) total_recv, num_recv_procs);
   P4EST_FREE (data->send_buf);
   P4EST_FREE (recv_buf);
@@ -2642,6 +2647,17 @@ p4est_lnodes_new (p4est_t * p4est, p4est_ghost_t * ghost_layer, int degree)
 
   p4est_lnodes_reset_data (&data, p4est);
 
+#ifdef P4EST_ENABLE_DEBUG
+  {
+    sc_statinfo_t       nodestat;
+
+    sc_stats_set1 (&nodestat, (double) lnodes->owned_count,
+                   "Nodes per processor");
+    sc_stats_compute (p4est->mpicomm, 1, &nodestat);
+    sc_stats_print (p4est_package_id, SC_LP_STATISTICS, 1, &nodestat, 1, 0);
+  }
+#endif
+
   p4est_log_indent_pop ();
   P4EST_GLOBAL_PRODUCTIONF ("Done " P4EST_STRING "_lnodes_new with"
                             " %lld global nodes\n",
@@ -3232,6 +3248,234 @@ p4est_ghost_support_lnodes (p4est_t * p4est, p4est_lnodes_t * lnodes,
 #endif
 }
 
+typedef struct p4est_part_lnodes
+{
+  int                 nodes_per_corner;
+  int                 nodes_per_edge;
+  int                 nodes_per_face;
+  int                 nodes_per_volume;
+  int                *weights;
+  int                 count;
+}
+p4est_part_lnodes_t;
+
+static void
+p4est_lnodes_count_corner (p4est_iter_corner_info_t * info, void *user_data)
+{
+  p4est_part_lnodes_t *part = (p4est_part_lnodes_t *) user_data;
+  p4est_iter_corner_side_t *side;
+
+  side = p4est_iter_cside_array_index (&info->sides, 0);
+
+  if (!side->is_ghost) {
+    p4est_topidx_t      t;
+    p4est_tree_t       *tree;
+    p4est_locidx_t      offset;
+    p4est_locidx_t      quadid;
+
+    t = side->treeid;
+    tree = p4est_tree_array_index (info->p4est->trees, t);
+    offset = tree->quadrants_offset;
+    quadid = side->quadid + offset;
+    part->weights[quadid] += part->nodes_per_corner;
+  }
+}
+
+#ifdef P4_TO_P8
+static void
+p8est_lnodes_count_edge (p8est_iter_edge_info_t * info, void *user_data)
+{
+  p4est_part_lnodes_t *part = (p4est_part_lnodes_t *) user_data;
+  p8est_iter_edge_side_t *side;
+  int                 is_ghost;
+  p4est_locidx_t      quadid;
+  p4est_topidx_t      t;
+  p4est_tree_t       *tree;
+  p4est_locidx_t      offset;
+
+  side = p8est_iter_eside_array_index (&info->sides, 0);
+
+  t = side->treeid;
+  tree = p4est_tree_array_index (info->p4est->trees, t);
+  offset = tree->quadrants_offset;
+  if (!side->is_hanging) {
+    is_ghost = side->is.full.is_ghost;
+    quadid = side->is.full.quadid;
+  }
+  else {
+    is_ghost = side->is.hanging.is_ghost[0];
+    quadid = side->is.hanging.quadid[0];
+  }
+
+  if (!is_ghost) {
+    quadid += offset;
+    part->weights[quadid] += part->nodes_per_edge;
+  }
+}
+#endif
+
+static void
+p4est_lnodes_count_face (p4est_iter_face_info_t * info, void *user_data)
+{
+  p4est_part_lnodes_t *part = (p4est_part_lnodes_t *) user_data;
+  p4est_iter_face_side_t *side;
+  int                 is_ghost;
+  p4est_locidx_t      quadid;
+  p4est_topidx_t      t;
+  p4est_tree_t       *tree;
+  p4est_locidx_t      offset;
+
+  side = p4est_iter_fside_array_index (&info->sides, 0);
+
+  t = side->treeid;
+  tree = p4est_tree_array_index (info->p4est->trees, t);
+  offset = tree->quadrants_offset;
+  if (!side->is_hanging) {
+    is_ghost = side->is.full.is_ghost;
+    quadid = side->is.full.quadid;
+  }
+  else {
+    is_ghost = side->is.hanging.is_ghost[0];
+    quadid = side->is.hanging.quadid[0];
+  }
+
+  if (!is_ghost) {
+    quadid += offset;
+    part->weights[quadid] += part->nodes_per_face;
+  }
+}
+
+static void
+p4est_lnodes_count_volume (p4est_iter_volume_info_t * info, void *user_data)
+{
+  p4est_part_lnodes_t *part = (p4est_part_lnodes_t *) user_data;
+  p4est_locidx_t      quadid;
+  p4est_topidx_t      t;
+  p4est_tree_t       *tree;
+  p4est_locidx_t      offset;
+
+  t = info->treeid;
+  tree = p4est_tree_array_index (info->p4est->trees, t);
+  offset = tree->quadrants_offset;
+
+  quadid = info->quadid + offset;
+  part->weights[quadid] += part->nodes_per_volume;
+}
+
+static int
+p4est_lnodes_weight (p4est_t * p4est, p4est_topidx_t which_tree,
+                     p4est_quadrant_t * quadrant)
+{
+  p4est_part_lnodes_t *part = (p4est_part_lnodes_t *) p4est->user_pointer;
+  int                 count = part->count;
+  int                 weight = part->weights[count];
+
+  part->count++;
+
+  return weight;
+}
+
+void
+p4est_partition_lnodes_detailed (p4est_t * p4est, p4est_ghost_t * ghost,
+                                 int nodes_per_volume, int nodes_per_face,
+#ifdef P4_TO_P8
+                                 int nodes_per_edge,
+#endif
+                                 int nodes_per_corner,
+                                 int partition_for_coarsening)
+{
+  int                *weights;
+  int                 ghost_given = (ghost != NULL);
+  p4est_iter_corner_t citer = NULL;
+#ifdef P4_TO_P8
+  p8est_iter_edge_t   eiter = NULL;
+#endif
+  p4est_iter_face_t   fiter = NULL;
+  p4est_iter_volume_t viter = NULL;
+  p4est_part_lnodes_t part;
+  void               *orig_user_pointer = p4est->user_pointer;
+
+  if (!ghost_given) {
+    ghost = p4est_ghost_new (p4est, P4EST_CONNECT_FULL);
+  }
+
+  part.nodes_per_corner = nodes_per_corner;
+#ifdef P4_TO_P8
+  part.nodes_per_edge = nodes_per_edge;
+#endif
+  part.nodes_per_face = nodes_per_face;
+  part.nodes_per_volume = nodes_per_volume;
+
+  if (nodes_per_corner) {
+    citer = p4est_lnodes_count_corner;
+  }
+#ifdef P4_TO_P8
+  if (nodes_per_edge) {
+    eiter = p8est_lnodes_count_edge;
+  }
+#endif
+  if (nodes_per_face) {
+    fiter = p4est_lnodes_count_face;
+  }
+  if (nodes_per_volume) {
+    viter = p4est_lnodes_count_volume;
+  }
+
+  weights = P4EST_ALLOC_ZERO (int, p4est->local_num_quadrants);
+
+  part.weights = weights;
+
+  p4est_iterate (p4est, ghost, &part, viter, fiter,
+#ifdef P4_TO_P8
+                 eiter,
+#endif
+                 citer);
+
+  p4est->user_pointer = &part;
+  part.count = 0;
+
+  p4est_partition_ext (p4est, partition_for_coarsening, p4est_lnodes_weight);
+
+  p4est->user_pointer = orig_user_pointer;
+
+  P4EST_FREE (weights);
+
+  if (!ghost_given) {
+    p4est_ghost_destroy (ghost);
+  }
+}
+
+void
+p4est_partition_lnodes (p4est_t * p4est, p4est_ghost_t * ghost, int degree,
+                        int partition_for_coarsening)
+{
+  int                 nodes_per_volume, nodes_per_face, nodes_per_corner;
+#ifdef P4_TO_P8
+  int                 nodes_per_edge;
+#endif
+
+  P4EST_ASSERT (degree >= 1);
+
+#ifndef P4_TO_P8
+  nodes_per_corner = 1;
+  nodes_per_face = (degree - 1);
+  nodes_per_volume = (degree - 1) * (degree - 1);
+#else
+  nodes_per_corner = 1;
+  nodes_per_edge = (degree - 1);
+  nodes_per_face = (degree - 1) * (degree - 1);
+  nodes_per_volume = (degree - 1) * (degree - 1) * (degree - 1);
+#endif
+
+  p4est_partition_lnodes_detailed (p4est, ghost, nodes_per_volume,
+                                   nodes_per_face,
+#ifdef P4_TO_P8
+                                   nodes_per_edge,
+#endif
+                                   nodes_per_corner,
+                                   partition_for_coarsening);
+}
+
 p4est_lnodes_buffer_t *
 p4est_lnodes_share_owned_begin (sc_array_t * node_data,
                                 p4est_lnodes_t * lnodes)
diff --git a/src/p4est_lnodes.h b/src/p4est_lnodes.h
index 4b90fc4..eea9f8a 100644
--- a/src/p4est_lnodes.h
+++ b/src/p4est_lnodes.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -235,6 +236,31 @@ void                p4est_ghost_expand_by_lnodes (p4est_t * p4est,
                                                   p4est_lnodes_t * lnodes,
                                                   p4est_ghost_t * ghost);
 
+/** Partition using weights based on the number of nodes assigned to each
+ * element in lnodes
+ *
+ * \param[in,out] p4est                    the forest to be repartitioned
+ * \param[in]     ghost                    the ghost layer
+ * \param[in]     degree                   the degree that would be passed to p4est_lnodes_new()
+ * \param[in]     partition_for_coarsening whether the partition should allow
+ *                                         coarsening (i.e. group siblings who
+ *                                         might merge)
+ */
+void                p4est_partition_lnodes (p4est_t * p4est,
+                                            p4est_ghost_t * ghost, int degree,
+                                            int partition_for_coarsening);
+
+/** Partition using weights that are broken down by where they reside: in
+ * volumes, on faces, or on corners.
+ */
+void                p4est_partition_lnodes_detailed (p4est_t * p4est,
+                                                     p4est_ghost_t * ghost,
+                                                     int nodes_per_volume,
+                                                     int nodes_per_face,
+                                                     int nodes_per_corner,
+                                                     int
+                                                     partition_for_coarsening);
+
 /** p4est_lnodes_buffer_t handles the communication of data associated with
  * nodes.
  *
@@ -332,7 +358,7 @@ p4est_lnodes_rank_array_index_int (sc_array_t * array, int it)
   P4EST_ASSERT (it >= 0 && (size_t) it < array->elem_count);
 
   return (p4est_lnodes_rank_t *)
-    (array->array + sizeof (p4est_lnodes_rank_t) * it);
+    (array->array + sizeof (p4est_lnodes_rank_t) * (size_t) it);
 }
 
 /** Return a pointer to a lnodes_rank array element indexed by a size_t.
diff --git a/src/p4est_mesh.c b/src/p4est_mesh.c
index 6948d2c..a33d0aa 100644
--- a/src/p4est_mesh.c
+++ b/src/p4est_mesh.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -103,12 +104,14 @@ mesh_iter_corner (p4est_iter_corner_info_t * info, void *user_data)
   int                 visited[P4EST_CHILDREN];
   int8_t             *pccorner;
   size_t              cz, zz;
+  sc_array_t         *trees;
   p4est_locidx_t      qoffset, qid1, qid2;
   p4est_locidx_t      cornerid_offset, cornerid;
   p4est_locidx_t     *pcquad;
   p4est_mesh_t       *mesh = (p4est_mesh_t *) user_data;
   p4est_iter_corner_side_t *side1, *side2;
   p4est_tree_t       *tree1, *tree2;
+  p4est_connectivity_t *conn;
 
   /* Check the case when the corner does not involve neighbors */
   cz = info->sides.elem_count;
@@ -117,6 +120,9 @@ mesh_iter_corner (p4est_iter_corner_info_t * info, void *user_data)
   if (cz == 1) {
     return;
   }
+  conn = info->p4est->connectivity;
+  trees = info->p4est->trees;
+  cornerid_offset = mesh->local_num_quadrants + mesh->ghost_num_quadrants;
 
   if (info->tree_boundary == P4EST_CONNECT_FACE) {
     /* This corner is inside an inter-tree face */
@@ -125,7 +131,6 @@ mesh_iter_corner (p4est_iter_corner_info_t * info, void *user_data)
       return;
     }
     P4EST_ASSERT (cz == P4EST_CHILDREN);
-    cornerid_offset = mesh->local_num_quadrants + mesh->ghost_num_quadrants;
 
     /* Process a corner in pairs of diagonal inter-tree neighbors */
     memset (visited, 0, P4EST_CHILDREN * sizeof (int));
@@ -147,7 +152,7 @@ mesh_iter_corner (p4est_iter_corner_info_t * info, void *user_data)
           f1 = tree_face_quadrant_corner_face (side1->quad, side1->corner);
           fc1 = p4est_corner_face_corners[side1->corner][f1];
           P4EST_ASSERT (0 <= fc1 && fc1 < P4EST_HALF);
-          tree1 = p4est_tree_array_index (info->p4est->trees, side1->treeid);
+          tree1 = p4est_tree_array_index (trees, side1->treeid);
           qid1 = side1->quadid + (side1->is_ghost ? mesh->local_num_quadrants
                                   : tree1->quadrants_offset);
           visited[j] = 1;
@@ -169,8 +174,7 @@ mesh_iter_corner (p4est_iter_corner_info_t * info, void *user_data)
         /* This side as in the opposite tree */
         fc2 = p4est_corner_face_corners[side2->corner][f2];
         P4EST_ASSERT (0 <= fc2 && fc2 < P4EST_HALF);
-        code = info->p4est->connectivity->tree_to_face[P4EST_FACES *
-                                                       side1->treeid + f1];
+        code = conn->tree_to_face[P4EST_FACES * side1->treeid + f1];
         orientation = code / P4EST_FACES;
         P4EST_ASSERT (f2 == code % P4EST_FACES);
 #ifdef P4_TO_P8
@@ -186,7 +190,7 @@ mesh_iter_corner (p4est_iter_corner_info_t * info, void *user_data)
         }
 
         /* We have found a diagonally opposite second side */
-        tree2 = p4est_tree_array_index (info->p4est->trees, side2->treeid);
+        tree2 = p4est_tree_array_index (trees, side2->treeid);
         qid2 = side2->quadid + (side2->is_ghost ? mesh->local_num_quadrants
                                 : tree2->quadrants_offset);
         if (!side1->is_ghost) {
@@ -223,7 +227,7 @@ mesh_iter_corner (p4est_iter_corner_info_t * info, void *user_data)
     for (zz = 0; zz < cz; ++zz) {
       side1 = (p4est_iter_corner_side_t *) sc_array_index (&info->sides, zz);
       if (!side1->is_ghost) {
-        tree1 = p4est_tree_array_index (info->p4est->trees, side1->treeid);
+        tree1 = p4est_tree_array_index (trees, side1->treeid);
         qid1 = side1->quadid + tree1->quadrants_offset;
         P4EST_ASSERT (0 <= qid1 && qid1 < mesh->local_num_quadrants);
         P4EST_ASSERT (mesh->quad_to_corner[P4EST_CHILDREN * qid1 +
@@ -236,25 +240,103 @@ mesh_iter_corner (p4est_iter_corner_info_t * info, void *user_data)
 #endif
 
   if (info->tree_boundary == P4EST_CONNECT_CORNER) {
-    /* True tree corner neighbors are not implemented yet: set to -2 */
+    int                 c1, ncorner[P4EST_DIM];
+    int                 nface[P4EST_DIM];
+    int                 ignore;
+    size_t              z2;
+    int8_t             *ccorners;
+    p4est_topidx_t      t1, ntree[P4EST_DIM];
+    p4est_locidx_t      goodones;
+    p4est_locidx_t     *cquads;
+
+    /* Loop through all corner sides, that is the quadrants touching it.  For
+     * each of these quadrants, determine the corner sides that can potentially
+     * occur by being a face neighbor as well.  Exclude these face neighbors
+     * and the quadrant itself, record all others as corner neighbors.
+     */
+    cquads = P4EST_ALLOC (p4est_locidx_t, cz - 1);
+    ccorners = P4EST_ALLOC (int8_t, cz - 1);
     for (zz = 0; zz < cz; ++zz) {
       side1 = (p4est_iter_corner_side_t *) sc_array_index (&info->sides, zz);
       if (!side1->is_ghost) {
-        tree1 = p4est_tree_array_index (info->p4est->trees, side1->treeid);
+        /* We only create corner information for processor-local quadrants */
+        t1 = side1->treeid;
+        c1 = (int) side1->corner;
+        tree1 = p4est_tree_array_index (trees, t1);
         qid1 = side1->quadid + tree1->quadrants_offset;
         P4EST_ASSERT (0 <= qid1 && qid1 < mesh->local_num_quadrants);
-        P4EST_ASSERT (mesh->quad_to_corner[P4EST_CHILDREN * qid1 +
-                                           side1->corner] == -1);
-        mesh->quad_to_corner[P4EST_CHILDREN * qid1 + side1->corner] = -2;
+        P4EST_ASSERT (mesh->quad_to_corner[P4EST_CHILDREN * qid1 + c1] == -1);
+
+        /* Check all quadrant faces that touch this corner */
+        for (i = 0; i < P4EST_DIM; ++i) {
+          f1 = p4est_corner_faces[c1][i];
+          ntree[i] = conn->tree_to_tree[P4EST_FACES * t1 + f1];
+          nface[i] = conn->tree_to_face[P4EST_FACES * t1 + f1];
+          if (ntree[i] == t1 && nface[i] == f1) {
+            /* This is a physical face boundary, no face neighbor present */
+            ncorner[i] = -1;
+            continue;
+          }
+          /* We have a face neighbor */
+          orientation = nface[i] / P4EST_FACES;
+          nface[i] %= P4EST_FACES;
+          ncorner[i] = p4est_connectivity_face_neighbor_corner_orientation
+            (c1, f1, nface[i], orientation);
+        }
+
+        /* Go through corner neighbors and collect the true corners */
+        goodones = 0;
+        for (z2 = 0; z2 < cz; ++z2) {
+          if (z2 == zz) {
+            /* We do not count ourselves as a neighbor */
+            continue;
+          }
+          ignore = 0;
+          side2 =
+            (p4est_iter_corner_side_t *) sc_array_index (&info->sides, z2);
+          P4EST_ASSERT (side2->corner >= 0);
+          for (i = 0; i < P4EST_DIM; ++i) {
+            /* Ignore if this is one of the face neighbors' corners */
+            if (ncorner[i] == (int) side2->corner &&
+                ntree[i] == side2->treeid) {
+              ignore = 1;
+              break;
+            }
+          }
+          if (ignore) {
+            continue;
+          }
+
+          /* Record this corner neighbor */
+          tree2 = p4est_tree_array_index (trees, side2->treeid);
+          qid2 = side2->quadid + (side2->is_ghost ? mesh->local_num_quadrants
+                                  : tree2->quadrants_offset);
+          cquads[goodones] = qid2;
+          ccorners[goodones] = (int) side2->corner;
+          ++goodones;
+        }
+        P4EST_ASSERT ((size_t) goodones < cz);
+        if (goodones == 0) {
+          continue;
+        }
+
+        /* Allocate and fill corner information in the mesh structure */
+        cornerid = mesh_corner_allocate (mesh, goodones, &pcquad, &pccorner);
+        mesh->quad_to_corner[P4EST_CHILDREN * qid1 + c1] =
+          cornerid_offset + cornerid;
+        memcpy (pcquad, cquads, goodones * sizeof (p4est_locidx_t));
+        memcpy (pccorner, ccorners, goodones * sizeof (int8_t));
       }
     }
+    P4EST_FREE (cquads);
+    P4EST_FREE (ccorners);
     return;
   }
 
   /* Process a corner inside the tree in pairs of diagonal neighbors */
   P4EST_ASSERT (!info->tree_boundary);
   side1 = (p4est_iter_corner_side_t *) sc_array_index (&info->sides, 0);
-  tree1 = p4est_tree_array_index (info->p4est->trees, side1->treeid);
+  tree1 = p4est_tree_array_index (trees, side1->treeid);
   qoffset = tree1->quadrants_offset;
   memset (visited, 0, P4EST_CHILDREN * sizeof (int));
   for (i = 0; i < P4EST_HALF; ++i) {
diff --git a/src/p4est_mesh.h b/src/p4est_mesh.h
index 9ed53c5..01abef5 100644
--- a/src/p4est_mesh.h
+++ b/src/p4est_mesh.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -81,8 +82,6 @@ SC_EXTERN_C_BEGIN;
  * Each group contains the quadrant numbers encoded as usual for quad_to_quad
  * in corner_quad, and the corner number from the neighbor as corner_corner.
  *
- * Intra-tree corners and corners across an inter-tree face are implemented.
- * Other Inter-tree corners are NOT IMPLEMENTED and are assigned the value -2.
  * Corners with no diagonal neighbor at all are assigned the value -1.
  */
 typedef struct
diff --git a/src/p4est_nodes.c b/src/p4est_nodes.c
index b2a5d4e..d8120cb 100644
--- a/src/p4est_nodes.c
+++ b/src/p4est_nodes.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p4est_nodes.h b/src/p4est_nodes.h
index 870a534..690374a 100644
--- a/src/p4est_nodes.h
+++ b/src/p4est_nodes.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p4est_plex.c b/src/p4est_plex.c
index dcb2d90..b10bd83 100644
--- a/src/p4est_plex.c
+++ b/src/p4est_plex.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -25,10 +26,12 @@
 #include <p4est_bits.h>
 #include <p4est_lnodes.h>
 #include <p4est_plex.h>
+#include <p4est_extended.h>
 #else
 #include <p8est_bits.h>
 #include <p8est_lnodes.h>
 #include <p8est_plex.h>
+#include <p8est_extended.h>
 #endif
 
 /** Decode the information from p{4,8}est_lnodes_t for a given element.
@@ -71,6 +74,31 @@ lnodes_decode2 (p4est_lnodes_code_t face_code,
   return 0;
 }
 
+static int
+p4est_gloidx_pair_compare (const void *a, const void *b)
+{
+  const p4est_gloidx_t *A = (const p4est_gloidx_t *) a;
+  const p4est_gloidx_t *B = (const p4est_gloidx_t *) b;
+
+  if (A[0] < B[0]) {
+    return -1;
+  }
+  else if (A[0] == B[0]) {      /* switch the order for the second value, because we want the value selected by sc_array_uniq (the last, though this is undocumented) to be the smallest */
+    if (A[1] < B[1]) {
+      return 1;
+    }
+    else if (A[1] == B[1]) {
+      return 0;
+    }
+    else {
+      return -1;
+    }
+  }
+  else {
+    return 1;
+  }
+}
+
 static void
 mark_parent (p4est_locidx_t qid, int ctype_int, p4est_lnodes_code_t * F,
              p4est_locidx_t * quad_to_local, int8_t * is_parent,
@@ -138,21 +166,19 @@ fill_orientations (p4est_quadrant_t * q, p4est_topidx_t t,
       nf = conn->tree_to_face[P4EST_FACES * t + f];
       o = nf / P4EST_FACES;
       nf = nf % P4EST_FACES;
-      if (nt != t && nf != f) {
-        if (nt < t || (nt == t && nf < f)) {
-          int                 set;
+      if (nt < t || (nt == t && nf < f)) {
+        int                 set;
 #ifdef P4_TO_P8
-          int                 ref;
+        int                 ref;
 #endif
 
 #ifndef P4_TO_P8
-          set = o;
+        set = o;
 #else
-          ref = p8est_face_permutation_refs[f][nf];
-          set = p8est_face_permutation_sets[ref][o];
+        ref = p8est_face_permutation_refs[f][nf];
+        set = p8est_face_permutation_sets[ref][o];
 #endif
-          quad_to_orientations[f] = set;
-        }
+        quad_to_orientations[f] = set;
       }
     }
   }
@@ -282,9 +308,10 @@ parent_to_child (p4est_quadrant_t * q, p4est_topidx_t t, p4est_locidx_t qid,
                  int ctype_int, p4est_lnodes_code_t * F,
                  p4est_locidx_t * quad_to_local,
                  int8_t * quad_to_orientations,
-                 int8_t * quad_to_orientations_orig, int8_t * referenced,
+                 int8_t * quad_to_orientations_orig,
                  int8_t * node_dim, p4est_locidx_t * child_offsets,
-                 p4est_locidx_t * child_to_id, p4est_connectivity_t * conn)
+                 p4est_locidx_t * child_to_id, p4est_connectivity_t * conn,
+                 int custom_numbering)
 {
 #ifndef P4_TO_P8
   int                 dim_limits[3] = { 0, 4, 8 };
@@ -345,7 +372,6 @@ parent_to_child (p4est_quadrant_t * q, p4est_topidx_t t, p4est_locidx_t qid,
 #endif
             child = child_offsets[quad_to_local[qid * V + v]] + childid;
             quad_to_local[qid * V + v] = child;
-            referenced[child] = 1;
           }
         }
       }
@@ -374,7 +400,6 @@ parent_to_child (p4est_quadrant_t * q, p4est_topidx_t t, p4est_locidx_t qid,
             P4EST_ASSERT (dim == 1 || dim == 2);
             child += (dim == 1) ? 2 : 8;
             quad_to_local[qid * V + v] = child;
-            referenced[child] = 1;
           }
         }
       }
@@ -394,7 +419,6 @@ parent_to_child (p4est_quadrant_t * q, p4est_topidx_t t, p4est_locidx_t qid,
               /* TODO: reconcile intrinsic/extrinsic order */
               child = child_offsets[quad_to_local[qid * V + v]] + (h ^ o);
               quad_to_local[qid * V + v] = child;
-              referenced[child] = 1;
             }
             else {
               int                 i;
@@ -451,8 +475,7 @@ parent_to_child (p4est_quadrant_t * q, p4est_topidx_t t, p4est_locidx_t qid,
                     child_offsets[quad_to_local[qid * V + f]] + P4EST_HALF +
                     child_edge;
                   quad_to_local[qid * V + v] = child;
-                  referenced[child] = 1;
-                  if (child_edge & 1) {
+                  if (!custom_numbering && (child_edge & 1)) {
                     quad_to_orientations[qid * no + P4EST_FACES + edge] ^= 1;
                   }
                   break;
@@ -470,28 +493,31 @@ parent_to_child (p4est_quadrant_t * q, p4est_topidx_t t, p4est_locidx_t qid,
 
 /* *INDENT-OFF* */
 #ifndef P4_TO_P8
-static int p4est_to_plex_child_id[1][3] = {{9, 10, 25}};
-static int p4est_to_plex_face_orientation[4][2] = {{-2,  0},
+static const int p4est_to_plex_child_id[1][3] = {{9, 10, 25}};
+static const int p4est_to_plex_face_orientation[4][2] = {{-2,  0},
                                                    { 0, -2},
                                                    { 0, -2},
                                                    {-2,  0}};
-static int p4est_to_plex_position[1][4] = {{3, 1, 0, 2}};
+static const int p4est_to_plex_position[1][4] = {{3, 1, 0, 2}};
 #else
-static int p4est_to_plex_child_id[2][9] =
+static const int p4est_to_plex_child_id_orig[2][9] =
                                      {{15, 16, 18, 17, 90, 88, 87, 89, 137},
                                       {63, 64, 125, -1, -1, -1, -1, -1, -1}};
-static int p4est_to_plex_face_orientation[6][8] =
+static const int p4est_to_plex_child_id_custom[2][9] =
+                                     {{15, 16, 17, 18, 87, 88, 89, 90, 137},
+                                      {63, 64, 125, -1, -1, -1, -1, -1, -1}};
+static const int p4est_to_plex_face_orientation[6][8] =
                                            {{-4,  0,  3, -1, -3,  1,  2, -2},
                                             { 0, -4, -1,  3,  1, -3, -2,  2},
                                             { 0, -4, -1,  3,  1, -3, -2,  2},
                                             {-1,  1,  0, -2, -4,  2,  3, -3},
                                             {-4,  0,  3, -1, -3,  1,  2, -2},
                                             { 0, -4, -1,  3,  1, -3, -2,  2}};
-static int p4est_to_plex_edge_orientation[4][2] = {{-2,  0},
+static const int p4est_to_plex_edge_orientation[4][2] = {{-2,  0},
                                                    { 0, -2},
                                                    { 0, -2},
                                                    {-2,  0}};
-static int p4est_to_plex_position[2][6] = {{5, 4, 2, 3, 0, 1},
+static const int p4est_to_plex_position[2][6] = {{5, 4, 2, 3, 0, 1},
                                            {3, 1, 0, 2, -1, -1}};
 #endif
 /* *INDENT-ON* */
@@ -506,7 +532,7 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
                          sc_array_t * out_vertex_coords,
                          sc_array_t * out_children, sc_array_t * out_parents,
                          sc_array_t * out_childids, sc_array_t * out_leaves,
-                         sc_array_t * out_remotes)
+                         sc_array_t * out_remotes, int custom_numbering)
 {
 #ifndef P4_TO_P8
   int                 dim_limits[3] = { 0, 4, 8 };
@@ -514,13 +540,14 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
 #else
   int                 dim_limits[4] = { 0, 6, 18, 26 };
   int                 no = P4EST_FACES + P8EST_EDGES;
+  const int           (*p4est_to_plex_child_id)[9] = custom_numbering ?
+    p4est_to_plex_child_id_custom : p4est_to_plex_child_id_orig;
 #endif
   p4est_locidx_t     *cones;
-  int                *orientations;
+  p4est_locidx_t     *orientations;
   sc_array_t         *child_to_parent, *child_to_id;
   p4est_locidx_t     *quad_to_local, *quad_to_local_orig = NULL;
   int8_t             *quad_to_orientations, *quad_to_orientations_orig = NULL;
-  int8_t             *referenced;
   p4est_lnodes_code_t *F;
   p4est_topidx_t      t, flt = p4est->first_local_tree;
   p4est_topidx_t      llt = p4est->last_local_tree;
@@ -613,7 +640,7 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
   }
 
   /* assign a local index to each global node referenced */
-  sc_array_sort (all_global, p4est_gloidx_compare);
+  sc_array_sort (all_global, p4est_gloidx_pair_compare);
   quad_to_local = P4EST_ALLOC (p4est_locidx_t, K * V);
   num_global = 0;
   last_global = -1;
@@ -643,8 +670,7 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
   /* in lnodes, hanging faces/edges/corners are not assigned indices, they are
    * simply references to the anchor points with a hanging type arrow.  In
    * plex, however, all points have indices, so we have to expand these points
-   * at the end of the list of local nodes (they do not, however, get global
-   * indices) */
+   * at the end of the list of local nodes. */
   /* figure out which nodes are parents and mark node dimensions */
   is_parent = sc_array_new_size (sizeof (int8_t), num_global);
   node_dim = sc_array_new_size (sizeof (int8_t), num_global);
@@ -653,8 +679,99 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
     mark_parent (qid, ctype_int, F, quad_to_local,
                  (int8_t *) is_parent->array, (int8_t *) node_dim->array);
   }
+  /* share the results of marking parents */
+  if (mpisize > 1) {
+    sc_array_t         *is_parent_lnodes;
+    p4est_lnodes_buffer_t *buffer;
+    size_t              zz;
+
+    /* share equivalent lnodes */
+    is_parent_lnodes =
+      sc_array_new_size (sizeof (int8_t), (size_t) lnodes->num_local_nodes);
+    memset (is_parent_lnodes->array, 0,
+            is_parent_lnodes->elem_count * is_parent_lnodes->elem_size);
+    for (il = 0; il < Klocal; il++) {
+      for (v = 0; v < V; v++) {
+        p4est_locidx_t      lidx = quad_to_local[il * V + v];
+        p4est_locidx_t      eidx = lnodes->element_nodes[il * V + v];
+
+        *((int8_t *) sc_array_index (is_parent_lnodes, eidx)) |=
+          *((int8_t *) sc_array_index (is_parent, lidx));
+      }
+    }
+    buffer = p4est_lnodes_share_all (is_parent_lnodes, lnodes);
+    for (zz = 0; zz < lnodes->sharers->elem_count; zz++) {
+      p4est_lnodes_rank_t *rank =
+        p4est_lnodes_rank_array_index (lnodes->sharers, zz);
+      sc_array_t         *shared_nodes = &(rank->shared_nodes);
+      sc_array_t         *recv =
+        (sc_array_t *) sc_array_index (buffer->recv_buffers, zz);
+      size_t              zy;
+
+      if (rank->rank == mpirank) {
+        continue;
+      }
+      for (zy = 0; zy < shared_nodes->elem_count; zy++) {
+        il = *((p4est_locidx_t *) sc_array_index (shared_nodes, zy));
+        int8_t              val = *((int8_t *) sc_array_index (recv, zy));
+
+        *((int8_t *) sc_array_index (is_parent_lnodes, il)) |= val;
+      }
+    }
+    p4est_lnodes_buffer_destroy (buffer);
+    for (il = 0; il < Klocal; il++) {
+      for (v = 0; v < V; v++) {
+        p4est_locidx_t      lidx = quad_to_local[il * V + v];
+        p4est_locidx_t      eidx = lnodes->element_nodes[il * V + v];
+
+        *((int8_t *) sc_array_index (is_parent, lidx)) |=
+          *((int8_t *) sc_array_index (is_parent_lnodes, eidx));
+      }
+    }
+    sc_array_destroy (is_parent_lnodes);
+    if (overlap) {              /* share to ghosts */
+      int8_t            **mirror_data;
+      sc_array_t         *is_parent_quad;
+
+      mirror_data = P4EST_ALLOC (int8_t *, num_mirrors);
+      is_parent_quad = sc_array_new_size (V * sizeof (int8_t), K);
+      for (il = 0; il < Klocal; il++) {
+        int8_t             *vals =
+          (int8_t *) sc_array_index (is_parent_quad, il);
+
+        for (v = 0; v < V; v++) {
+          p4est_locidx_t      lidx = quad_to_local[il * V + v];
+
+          vals[v] = *((int8_t *) sc_array_index (is_parent, lidx));
+        }
+      }
+      for (il = 0; il < num_mirrors; il++) {
+        p4est_quadrant_t   *q;
+
+        q = p4est_quadrant_array_index (&ghost->mirrors, il);
+        qid = q->p.piggy3.local_num;
+        mirror_data[il] = (int8_t *) sc_array_index (is_parent_quad, qid);
+      }
+      p4est_ghost_exchange_custom (p4est, ghost,
+                                   (size_t) V * sizeof (int8_t),
+                                   (void **) mirror_data, (int8_t *)
+                                   sc_array_index (is_parent_quad, Klocal));
+      P4EST_FREE (mirror_data);
+      for (il = Klocal; il < K; il++) {
+        int8_t             *vals =
+          (int8_t *) sc_array_index (is_parent_quad, il);
+
+        for (v = 0; v < V; v++) {
+          p4est_locidx_t      lidx = quad_to_local[il * V + v];
+
+          *((int8_t *) sc_array_index (is_parent, lidx)) |= vals[v];
+        }
+      }
+      sc_array_destroy (is_parent_quad);
+    }
+  }
   child_offsets = P4EST_ALLOC (p4est_locidx_t, num_global + 1);
-  /* childredn are appended to the list of global nodes */
+  /* children are appended to the list of global nodes */
   child_offsets[0] = num_global;
   for (il = 0; il < num_global; il++) {
     int8_t              parent =
@@ -676,7 +793,6 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
     }
     child_offsets[il + 1] = child_offsets[il] + count;
   }
-  sc_array_destroy (is_parent);
   num_global_plus_children = child_offsets[num_global];
 
   /* expand the node_dim array so that they also have entries
@@ -734,7 +850,6 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
   /* loop over quads:
    * - where quad_to_local refers to a parent, make it refer to the correct
    *   child
-   * - mark children that are actually referenced (some may not be)
    * - fill quad_to_orientations
    */
   quad_to_orientations = P4EST_ALLOC (int8_t, K * no);
@@ -745,10 +860,6 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
 #ifdef P4EST_ENABLE_DEBUG
   memset (quad_to_orientations, -1, K * no * sizeof (int8_t));
 #endif
-  referenced = P4EST_ALLOC_ZERO (int8_t, num_global_plus_children);
-  for (il = 0; il < num_global; il++) {
-    referenced[il] = 1;
-  }
   for (qid = 0, t = flt; t <= llt; t++) {
     p4est_tree_t       *tree = p4est_tree_array_index (p4est->trees, t);
     sc_array_t         *quadrants = &(tree->quadrants);
@@ -761,10 +872,9 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
       parent_to_child (q, t, qid, ctype_int, F, quad_to_local,
                        quad_to_orientations,
                        quad_to_orientations_orig,
-                       referenced,
                        (int8_t *) node_dim->array, child_offsets,
                        (p4est_locidx_t *) child_to_id->array,
-                       p4est->connectivity);
+                       p4est->connectivity, custom_numbering);
     }
   }
   if (overlap) {
@@ -779,77 +889,19 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
         parent_to_child (q, t, il + Klocal, ctype_int, F, quad_to_local,
                          quad_to_orientations,
                          quad_to_orientations_orig,
-                         referenced,
                          (int8_t *) node_dim->array, child_offsets,
                          (p4est_locidx_t *) child_to_id->array,
-                         p4est->connectivity);
+                         p4est->connectivity, custom_numbering);
       }
     }
     P4EST_FREE (F);
   }
-  P4EST_FREE (child_offsets);
 #ifdef P4EST_ENABLE_DEBUG
   for (il = 0; il < K * no; il++) {
     P4EST_ASSERT (quad_to_orientations[il] >= 0);
   }
 #endif
 
-  /* compress unreferenced children out of the local list */
-  {
-    p4est_locidx_t     *old_to_new, *new_to_old;
-    p4est_locidx_t      new_count, diff;
-
-    old_to_new = P4EST_ALLOC (p4est_locidx_t, num_global_plus_children);
-    new_to_old = P4EST_ALLOC (p4est_locidx_t, num_global_plus_children);
-
-    memset (old_to_new, -1, num_global_plus_children * sizeof (*old_to_new));
-    memset (new_to_old, -1, num_global_plus_children * sizeof (*new_to_old));
-    new_count = 0;
-    for (il = 0; il < num_global_plus_children; il++) {
-      if (referenced[il]) {
-        p4est_locidx_t      newidx;
-
-        newidx = new_count++;
-        old_to_new[il] = newidx;
-        new_to_old[newidx] = il;
-      }
-    }
-    P4EST_ASSERT (new_count >= num_global
-                  && new_count <= num_global_plus_children);
-    diff = num_global_plus_children - new_count;
-    num_global_plus_children -= diff;
-    for (il = 0; il < num_global_plus_children; il++) {
-      p4est_locidx_t      oldidx = new_to_old[il];
-      p4est_locidx_t     *pidold, *pidnew, *cidold, *cidnew;
-      int8_t             *dimold, *dimnew;
-
-      if (oldidx != il) {
-        cidold =
-          (p4est_locidx_t *) sc_array_index (child_to_id, (size_t) oldidx);
-        cidnew = (p4est_locidx_t *) sc_array_index (child_to_id, (size_t) il);
-        *cidnew = *cidold;
-        pidold =
-          (p4est_locidx_t *) sc_array_index (child_to_parent,
-                                             (size_t) oldidx);
-        pidnew =
-          (p4est_locidx_t *) sc_array_index (child_to_parent, (size_t) il);
-        *pidnew = *pidold;
-        dimold = (int8_t *) sc_array_index (node_dim, (size_t) oldidx);
-        dimnew = (int8_t *) sc_array_index (node_dim, (size_t) il);
-        *dimnew = *dimold;
-      }
-    }
-    sc_array_resize (child_to_id, num_global_plus_children);
-    sc_array_resize (child_to_parent, num_global_plus_children);
-    sc_array_resize (node_dim, num_global_plus_children);
-    for (il = 0; il < K * V; il++) {
-      quad_to_local[il] = old_to_new[quad_to_local[il]];
-    }
-    P4EST_FREE (old_to_new);
-    P4EST_FREE (new_to_old);
-    P4EST_FREE (referenced);
-  }
-
   /* now that we have the list of local nodes with children,
    * we have to assign each local node a plex index: plex indices are
    * stratified, and include the cells */
@@ -934,8 +986,11 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
 #endif
     point_count = 0;
     /* figure out the locations of ghosts within the base */
-    Gpre = (overlap && local_first) ? 0 : (ghost->proc_offsets[mpirank] -
-                                           ghost->proc_offsets[0]);
+    Gpre =
+      overlap ? (local_first ? 0
+                 : (ghost->proc_offsets[mpirank] -
+                    ghost->proc_offsets[0])) : 0;
+    *first_local_quad = Gpre;
     Gpost = overlap ?
       (local_first ? G : (ghost->proc_offsets[mpisize] -
                           ghost->proc_offsets[mpirank + 1])) : 0;
@@ -1000,10 +1055,26 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
       plex_to_local[pid] = nid;
       local_to_plex[nid] = pid;
       if (gid >= 0) {
+
         plex_to_proc[pid] = p;
       }
-      else {
-        plex_to_proc[pid] = mpirank;
+    }
+    for (il = 0; il < num_global; il++) {
+      p4est_locidx_t      pid, nid;
+      int8_t              isp;
+      int                 p;
+
+      nid = il + K;
+      pid = local_to_plex[nid];
+      isp = *((int8_t *) sc_array_index (is_parent, il));
+      if (isp) {
+        p4est_locidx_t      cstart = child_offsets[il];
+        p4est_locidx_t      cend = child_offsets[il + 1], c;
+
+        p = plex_to_proc[pid];
+        for (c = cstart; c < cend; c++) {
+          plex_to_proc[local_to_plex[c + K]] = p;
+        }
       }
     }
     P4EST_FREE (lnode_global_offset);
@@ -1041,8 +1112,6 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
         *outid = p4est_to_plex_child_id[P4EST_DIM - 1 - pdim][id];
       }
     }
-    sc_array_destroy (child_to_parent);
-    sc_array_destroy (child_to_id);
 
     /* compute cones and orientations */
     for (qid = 0; qid < K; qid++) {
@@ -1210,6 +1279,113 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
 #endif
       }
     }
+    /* correct for floating children */
+    for (il = 0; il < num_global; il++) {
+      p4est_locidx_t      cstart, cend, ppid;
+      p4est_locidx_t     *pcones, poff;
+#ifdef P4_TO_P8
+      p4est_locidx_t     *ornts;
+#endif
+      int8_t              pdim;
+
+      cstart = child_offsets[il];
+      cend = child_offsets[il + 1];
+      if (cstart == cend) {
+        continue;
+      }
+      pdim = *((int8_t *) sc_array_index (node_dim, (size_t) il));
+      ppid = local_to_plex[K + il];
+      poff =
+        dim_cone_offsets[P4EST_DIM - pdim] + 2 * pdim * (ppid -
+                                                         dim_offsets[P4EST_DIM
+                                                                     - pdim]);
+      pcones = &cones[poff];
+#ifdef P4_TO_P8
+      ornts = &orientations[poff];
+#endif
+      for (c = cstart; c < cend - 1; c++) {
+        p4est_locidx_t      cpid;
+        int8_t              cdim;
+        p4est_locidx_t     *ccones, coff;
+        p4est_locidx_t     *cornts;
+
+        cpid = local_to_plex[K + c];
+        cdim = *((int8_t *) sc_array_index (node_dim, (size_t) c));
+        coff =
+          dim_cone_offsets[P4EST_DIM - cdim] + 2 * cdim * (cpid -
+                                                           dim_offsets
+                                                           [P4EST_DIM -
+                                                            cdim]);
+        ccones = &cones[coff];
+        cornts = &orientations[coff];
+        if (cdim == 1) {
+          int                 side = (c - cstart) & 1;
+
+          cornts[0] = cornts[1] = 0;
+          if (pdim == 1) {
+            ccones[1 - side] = local_to_plex[K + cend - 1];
+            ccones[side] = pcones[side];
+          }
+#ifdef P4_TO_P8
+          else {
+            p4est_locidx_t      epid, lepid, estart, eend;
+
+            epid = pcones[p4est_to_plex_position[1][c - (cstart + 4)]];
+            lepid = plex_to_local[epid] - K;
+
+            estart = child_offsets[lepid];
+            eend = child_offsets[lepid + 1];
+            P4EST_ASSERT (eend > estart);
+            ccones[custom_numbering ? (1 - side) : 1] =
+              local_to_plex[K + cend - 1];
+            ccones[custom_numbering ? side : 0] = local_to_plex[K + eend - 1];
+          }
+#endif
+        }
+#ifdef P4_TO_P8
+        else {
+          int                 j;
+          p4est_locidx_t      cone_to_child[4][4] =
+            { {-1, 6, 4, -4}, {-1, -2, 5, 6}, {4, 7, -3, -4}, {5, -2, -3,
+                                                               7}
+          };
+          int                 cone_to_side[4][4] =
+            { {0, -1, -1, 1}, {1, 0, -1, -1}, {-1, -1, 1, 0}, {-1, 1, 0,
+                                                               -1}
+          };
+
+          for (j = 0; j < 4; j++) {
+            p4est_locidx_t      nchild = cone_to_child[c - cstart][j];
+            if (nchild >= 0) {
+              ccones[j] = local_to_plex[K + cstart + nchild];
+              cornts[j] =
+                custom_numbering ? ((j < 2) ? 0 : -2)
+                : ((j == ((c - cstart + 1) % 4)) ? 0 : -2);
+            }
+            else {
+              int                 epid, lepid, estart, eend;
+              int                 side = cone_to_side[c - cstart][j];
+
+              epid = pcones[-(nchild + 1)];
+              lepid = plex_to_local[epid] - K;
+
+              estart = child_offsets[lepid];
+              eend = child_offsets[lepid + 1];
+              P4EST_ASSERT (eend > estart);
+              cornts[j] = ornts[j];
+              if (!ornts[-(nchild + 1)]) {
+                ccones[j] = local_to_plex[K + estart + side];
+              }
+              else {
+                ccones[j] = local_to_plex[K + estart + (1 - side)];
+              }
+            }
+          }
+        }
+#endif
+      }
+
+    }
 #ifdef P4EST_ENABLE_DEBUG
     {
       size_t              zz, count = out_cones->elem_count;
@@ -1276,6 +1452,68 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
         }
       }
     }
+    /* interpolate children coordinates from parent vertices */
+    for (il = 0; il < num_global; il++) {
+      p4est_locidx_t      pid, nid;
+      int8_t              isp;
+
+      nid = il + K;
+      pid = local_to_plex[nid];
+      isp = *((int8_t *) sc_array_index (is_parent, il));
+      if (isp) {
+        p4est_locidx_t      cvert = child_offsets[il + 1] - 1, poff, vid;
+        int8_t              pdim;
+        p4est_locidx_t     *pcones;
+
+        pdim = *((int8_t *) sc_array_index (node_dim, (size_t) il));
+        poff =
+          dim_cone_offsets[P4EST_DIM - pdim] + 2 * pdim * (pid -
+                                                           dim_offsets
+                                                           [P4EST_DIM -
+                                                            pdim]);
+        pcones = &cones[poff];
+        vid = local_to_plex[cvert + K] - dim_offsets[P4EST_DIM];
+        if (pdim == 1) {
+          p4est_locidx_t      pvid[2];
+
+          pvid[0] = pcones[0] - dim_offsets[P4EST_DIM];
+          pvid[1] = pcones[1] - dim_offsets[P4EST_DIM];
+          coords[3 * vid + 0] =
+            0.5 * (coords[3 * pvid[0] + 0] + coords[3 * pvid[1] + 0]);
+          coords[3 * vid + 1] =
+            0.5 * (coords[3 * pvid[0] + 1] + coords[3 * pvid[1] + 1]);
+          coords[3 * vid + 2] =
+            0.5 * (coords[3 * pvid[0] + 2] + coords[3 * pvid[1] + 2]);
+        }
+#ifdef P4_TO_P8
+        else {
+          int                 j, k;
+
+          coords[3 * vid + 0] = 0.;
+          coords[3 * vid + 1] = 0.;
+          coords[3 * vid + 2] = 0.;
+
+          for (j = 0; j < 4; j++) {
+            p4est_locidx_t      coff, *ccones;
+            p4est_locidx_t      pvid[2];
+
+            coff =
+              dim_cone_offsets[P4EST_DIM - 1] + 2 * (pcones[j] -
+                                                     dim_offsets[P4EST_DIM -
+                                                                 1]);
+            ccones = &cones[coff];
+
+            pvid[0] = ccones[0] - dim_offsets[P4EST_DIM];
+            pvid[1] = ccones[1] - dim_offsets[P4EST_DIM];
+            for (k = 0; k < 3; k++) {
+              coords[3 * vid + k] +=
+                0.125 * (coords[3 * pvid[0] + k] + coords[3 * pvid[1] + k]);
+            }
+          }
+        }
+#endif
+      }
+    }
     if (overlap) {
       for (t = 0; t < p4est->connectivity->num_trees; t++) {
         p4est_locidx_t      il, istart = ghost->tree_offsets[t];
@@ -1338,7 +1576,6 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
       P4EST_FREE (quad_to_local_orig);
       P4EST_FREE (quad_to_orientations_orig);
     }
-    sc_array_destroy (node_dim);
 
     {
       sc_array_t         *quad_to_plex;
@@ -1389,8 +1626,10 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
       }
       /* communicate all_global * local_to_plex to build leaves and remotes */
       lnodes_to_plex =
-        sc_array_new_size (sizeof (p4est_locidx_t), lnodes->num_local_nodes);
-      quad_to_plex = sc_array_new_size (sizeof (p4est_locidx_t), V * K);
+        sc_array_new_size (sizeof (p4est_locidx_t) * (P4EST_DIM + 1),
+                           lnodes->num_local_nodes);
+      quad_to_plex =
+        sc_array_new_size (sizeof (p4est_locidx_t) * (P4EST_DIM + 1), V * K);
       if (lnodes->owned_count) {
         ssize_t             firstidx;
 
@@ -1399,24 +1638,59 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
                             p4est_gloidx_compare);
         P4EST_ASSERT (firstidx >= 0);
         for (il = 0; il < lnodes->owned_count; il++) {
+          p4est_gloidx_t     *gid =
+            (p4est_gloidx_t *) sc_array_index (all_global,
+                                               (size_t) firstidx + il);
+          p4est_locidx_t      eid = (p4est_locidx_t) gid[1];
+          p4est_locidx_t      lid = lnodes->element_nodes[eid];
+          p4est_locidx_t      loc, cstart, cend, j;
+
+          P4EST_ASSERT (gid[0] == lnodes->global_offset + il);
+          P4EST_ASSERT (eid / V < lnodes->num_local_elements);
           p4est_locidx_t     *lp =
-            (p4est_locidx_t *) sc_array_index (lnodes_to_plex, (size_t) il);
+            (p4est_locidx_t *) sc_array_index (lnodes_to_plex, (size_t) lid);
 
-          *lp = local_to_plex[firstidx + il + K];
+          loc = firstidx + il;
+          lp[0] = local_to_plex[loc + K];
+          for (j = 1; j < P4EST_DIM + 1; j++) {
+            lp[j] = -1;
+          }
+          if (loc < num_global) {
+            cstart = child_offsets[loc];
+            cend = child_offsets[loc + 1];
+            if (cend > cstart) {
+              int8_t              ndim =
+                *((int8_t *) sc_array_index (node_dim, loc)), d;
+
+              for (d = ndim; d >= 0; d--) {
+                lp[P4EST_DIM - d] = local_to_plex[cstart + K];
+                if (d == ndim) {
+                  cstart += 2 * ndim;
+                }
+#ifdef P4_TO_P8
+                else if (d == 1) {
+                  cstart += 4;
+                }
+#endif
+              }
+            }
+          }
         }
       }
       p4est_lnodes_share_owned (lnodes_to_plex, lnodes);
       for (il = 0; il < Klocal; il++) {
         for (v = 0; v < V; v++) {
+          int                 j;
           p4est_locidx_t      nid = lnodes->element_nodes[il * V + v];
-          p4est_locidx_t      lp = *((p4est_locidx_t *)
-                                     sc_array_index (lnodes_to_plex,
-                                                     (size_t) nid));
+          p4est_locidx_t     *lp =
+            (p4est_locidx_t *) sc_array_index (lnodes_to_plex, (size_t) nid);
           p4est_locidx_t     *qp =
             (p4est_locidx_t *) sc_array_index (quad_to_plex,
                                                (size_t) (il * V + v));
 
-          *qp = lp;
+          for (j = 0; j < P4EST_DIM + 1; j++) {
+            qp[j] = lp[j];
+          }
         }
       }
       sc_array_destroy (lnodes_to_plex);
@@ -1433,47 +1707,143 @@ p4est_get_plex_data_int (p4est_t * p4est, p4est_ghost_t * ghost,
             (p4est_locidx_t *) sc_array_index (quad_to_plex, qid * V);
         }
         p4est_ghost_exchange_custom (p4est, ghost,
-                                     (size_t) V * sizeof (p4est_locidx_t),
+                                     (size_t) V * (P4EST_DIM +
+                                                   1) *
+                                     sizeof (p4est_locidx_t),
                                      (void **) mirror_data, (p4est_locidx_t *)
                                      sc_array_index (quad_to_plex,
                                                      Klocal * V));
         P4EST_FREE (mirror_data);
       }
-      for (il = 0; il < num_global; il++) {
+      for (il = 0; il < num_global_plus_children; il++) {
         p4est_locidx_t      localpid = il + K;
 
         p = plex_to_proc[localpid];
         if (p != mpirank) {
-          p4est_gloidx_t     *gid =
-            (p4est_gloidx_t *) sc_array_index (all_global, il);
-          p4est_locidx_t      nid = gid[1];
-          p4est_locidx_t      pid =
-            *((p4est_locidx_t *) sc_array_index (quad_to_plex, (size_t) nid));
-          p4est_locidx_t     *leaf =
-            (p4est_locidx_t *) sc_array_push (out_leaves);
-          p4est_locidx_t     *remote =
-            (p4est_locidx_t *) sc_array_push (out_remotes);
-
-          *leaf = localpid;
-          remote[0] = p;
-          remote[1] = pid;
+          p4est_locidx_t      lid = plex_to_local[localpid] - K;
+
+          if (lid < num_global) {
+            p4est_gloidx_t     *gid =
+              (p4est_gloidx_t *) sc_array_index (all_global, lid);
+            p4est_locidx_t      eid = gid[1];
+            p4est_locidx_t      pid = *((p4est_locidx_t *)
+                                        sc_array_index (quad_to_plex,
+                                                        (size_t) eid));
+            p4est_locidx_t     *leaf =
+              (p4est_locidx_t *) sc_array_push (out_leaves);
+            p4est_locidx_t     *remote =
+              (p4est_locidx_t *) sc_array_push (out_remotes);
+
+            *leaf = localpid;
+            remote[0] = p;
+            remote[1] = pid;
+          }
+          else {
+            p4est_locidx_t      parent =
+              *((p4est_locidx_t *) sc_array_index (child_to_parent, lid));
+            p4est_locidx_t      id =
+              *((p4est_locidx_t *) sc_array_index (child_to_id, lid));
+            int8_t              dim =
+              *((int8_t *) sc_array_index (node_dim, lid));
+
+            P4EST_ASSERT (parent >= 0);
+            {
+              p4est_gloidx_t     *pgid =
+                (p4est_gloidx_t *) sc_array_index (all_global, parent);
+              p4est_locidx_t      peid = pgid[1];
+              p4est_locidx_t     *ppid =
+                (p4est_locidx_t *) sc_array_index (quad_to_plex,
+                                                   (size_t) peid);
+              p4est_locidx_t     *leaf =
+                (p4est_locidx_t *) sc_array_push (out_leaves);
+              p4est_locidx_t     *remote =
+                (p4est_locidx_t *) sc_array_push (out_remotes);
+
+              *leaf = localpid;
+              remote[0] = p;
+              remote[1] = -1;
+              if (dim == 0) {
+                remote[1] = ppid[P4EST_DIM - dim];
+              }
+              else if (dim == P4EST_DIM - 1) {
+                remote[1] = ppid[P4EST_DIM - dim] + id;
+              }
+#ifdef P4_TO_P8
+              else {
+                remote[1] = ppid[P4EST_DIM - dim] + (id - 4);
+              }
+#endif
+            }
+          }
         }
       }
       sc_array_destroy (quad_to_plex);
     }
+    sc_array_destroy (child_to_parent);
+    sc_array_destroy (child_to_id);
+    sc_array_destroy (node_dim);
     P4EST_FREE (plex_to_local);
     P4EST_FREE (local_to_plex);
     P4EST_FREE (plex_to_proc);
   }
-
   /* cleanup */
+  sc_array_destroy (is_parent);
+  P4EST_FREE (child_offsets);
+
   sc_array_destroy (all_global);
 }
 
 void
+p4est_get_plex_data_ext (p4est_t * p4est,
+                         p4est_ghost_t ** ghost,
+                         p4est_lnodes_t ** lnodes,
+                         p4est_connect_type_t ctype,
+                         int overlap, p4est_locidx_t * first_local_quad,
+                         sc_array_t * out_points_per_dim,
+                         sc_array_t * out_cone_sizes,
+                         sc_array_t * out_cones,
+                         sc_array_t * out_cone_orientations,
+                         sc_array_t * out_vertex_coords,
+                         sc_array_t * out_children,
+                         sc_array_t * out_parents,
+                         sc_array_t * out_childids,
+                         sc_array_t * out_leaves, sc_array_t * out_remotes,
+                         int custom_numbering)
+{
+  int                 ctype_int = p4est_connect_type_int (ctype);
+  int                 i;
+  int                 created_ghost = 0;
+
+  if (!*ghost) {
+    *ghost = p4est_ghost_new (p4est, ctype);
+    created_ghost = 1;
+  }
+  if (!*lnodes) {
+    *lnodes = p4est_lnodes_new (p4est, *ghost, -ctype_int);
+  }
+  if (created_ghost) {
+    if (overlap) {
+      p4est_ghost_support_lnodes (p4est, *lnodes, *ghost);
+    }
+    for (i = 1; i < overlap; i++) {
+      p4est_ghost_expand_by_lnodes (p4est, *lnodes, *ghost);
+    }
+  }
+  if (ctype != P4EST_CONNECT_FULL) {
+    p4est_lnodes_destroy (*lnodes);
+    *lnodes = p4est_lnodes_new (p4est, *ghost, -ctype);
+  }
+  p4est_get_plex_data_int (p4est, *ghost, *lnodes, overlap, 0,
+                           first_local_quad, out_points_per_dim,
+                           out_cone_sizes, out_cones, out_cone_orientations,
+                           out_vertex_coords, out_children, out_parents,
+                           out_childids, out_leaves, out_remotes,
+                           custom_numbering);
+}
+
+void
 p4est_get_plex_data (p4est_t * p4est, p4est_connect_type_t ctype,
-                     int overlap,
-                     p4est_locidx_t * first_local_quad,
+                     int overlap, p4est_locidx_t * first_local_quad,
                      sc_array_t * out_points_per_dim,
                      sc_array_t * out_cone_sizes,
                      sc_array_t * out_cones,
@@ -1484,28 +1854,16 @@ p4est_get_plex_data (p4est_t * p4est, p4est_connect_type_t ctype,
                      sc_array_t * out_childids,
                      sc_array_t * out_leaves, sc_array_t * out_remotes)
 {
-  p4est_ghost_t      *ghost;
-  p4est_lnodes_t     *lnodes;
-  int                 ctype_int = p4est_connect_type_int (ctype);
-  int                 i;
+  p4est_ghost_t      *ghost = NULL;
+  p4est_lnodes_t     *lnodes = NULL;
 
-  ghost = p4est_ghost_new (p4est, ctype);
-  lnodes = p4est_lnodes_new (p4est, ghost, -ctype_int);
-  if (overlap) {
-    p4est_ghost_support_lnodes (p4est, lnodes, ghost);
-  }
-  for (i = 1; i < overlap; i++) {
-    p4est_ghost_expand_by_lnodes (p4est, lnodes, ghost);
-  }
-  if (ctype != P4EST_CONNECT_FULL) {
-    p4est_lnodes_destroy (lnodes);
-    lnodes = p4est_lnodes_new (p4est, ghost, -ctype);
-  }
-  p4est_get_plex_data_int (p4est, ghost, lnodes, overlap, 0,
+  p4est_get_plex_data_ext (p4est, &ghost, &lnodes, ctype, overlap,
                            first_local_quad, out_points_per_dim,
-                           out_cone_sizes, out_cones, out_cone_orientations,
-                           out_vertex_coords, out_children, out_parents,
-                           out_childids, out_leaves, out_remotes);
-  p4est_ghost_destroy (ghost);
+                           out_cone_sizes, out_cones,
+                           out_cone_orientations, out_vertex_coords,
+                           out_children, out_parents, out_childids,
+                           out_leaves, out_remotes, 0);
+
   p4est_lnodes_destroy (lnodes);
+  p4est_ghost_destroy (ghost);
 }
diff --git a/src/p4est_plex.h b/src/p4est_plex.h
index 397342c..3af8ba0 100644
--- a/src/p4est_plex.h
+++ b/src/p4est_plex.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p4est_points.c b/src/p4est_points.c
index 88313d7..7a20bde 100644
--- a/src/p4est_points.c
+++ b/src/p4est_points.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -167,14 +168,14 @@ p4est_new_points (sc_MPI_Comm mpicomm, p4est_connectivity_t * connectivity,
   ppstate.maxlevel = maxlevel;
 
   /* assign some data members */
-  p4est->mpicomm = mpicomm;
-  p4est->mpisize = num_procs;
-  p4est->mpirank = rank;
   p4est->data_size = 2 * sizeof (p4est_locidx_t);       /* temporary */
   p4est->user_pointer = &ppstate;
   p4est->connectivity = connectivity;
   num_trees = connectivity->num_trees;
 
+  /* set parallel environment */
+  p4est_comm_parallel_env_assign (p4est, mpicomm);
+
   /* allocate memory pools */
   p4est->user_data_pool = sc_mempool_new (p4est->data_size);
   p4est->quadrant_pool = sc_mempool_new (sizeof (p4est_quadrant_t));
diff --git a/src/p4est_points.h b/src/p4est_points.h
index 169bbeb..8f9ff72 100644
--- a/src/p4est_points.h
+++ b/src/p4est_points.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p4est_search.c b/src/p4est_search.c
index 63e6ef6..c124d20 100644
--- a/src/p4est_search.c
+++ b/src/p4est_search.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -23,9 +24,11 @@
 
 #ifndef P4_TO_P8
 #include <p4est_bits.h>
+#include <p4est_communication.h>
 #include <p4est_search.h>
 #else
 #include <p8est_bits.h>
+#include <p8est_communication.h>
 #include <p8est_search.h>
 #endif
 
@@ -449,22 +452,30 @@ find_range_boundaries_exit:
   return touch;
 }
 
+/** This recursion context saves on the number of parameters passed. */
+typedef struct p4est_search_recursion
+{
+  p4est_t            *p4est;            /**< Forest being traversed. */
+  p4est_topidx_t      which_tree;       /**< Current tree number. */
+  p4est_search_query_t search_quadrant_fn;      /**< The quadrant callback. */
+  p4est_search_query_t search_point_fn;         /**< The point callback. */
+  sc_array_t         *points;           /**< Array of points to search. */
+}
+p4est_search_recursion_t;
+
 static void
-p4est_search_recursion (p4est_t * p4est, p4est_topidx_t which_tree,
+p4est_search_recursion (const p4est_search_recursion_t * rec,
                         p4est_quadrant_t * quadrant,
-                        p4est_search_query_t search_quadrant_fn,
-                        p4est_search_query_t search_point_fn,
-                        sc_array_t * quadrants,
-                        sc_array_t * points, sc_array_t * actives)
+                        sc_array_t * quadrants, sc_array_t * actives)
 {
+  const size_t        qcount = quadrants->elem_count;
   int                 i;
   int                 is_leaf, is_match;
-  size_t              qcount = quadrants->elem_count;
   size_t              zz, *pz, *qz;
   size_t              split[P4EST_CHILDREN + 1];
   p4est_locidx_t      local_num;
-  p4est_quadrant_t   *q, *lq, children[P4EST_CHILDREN];
-  sc_array_t          child_quadrants, child_actives;
+  p4est_quadrant_t   *q, *lq, child;
+  sc_array_t          child_quadrants, child_actives, *chact;
 
   /*
    * Invariants of the recursion:
@@ -472,10 +483,12 @@ p4est_search_recursion (p4est_t * p4est, p4est_topidx_t which_tree,
    * 2. quadrant is equal to or an ancestor of those in the array.
    */
 
-  P4EST_ASSERT (actives->elem_count <= points->elem_count);
+  P4EST_ASSERT ((rec->points == NULL) == (actives == NULL));
+  P4EST_ASSERT (rec->points == NULL ||
+                actives->elem_count <= rec->points->elem_count);
 
   /* return if there are no quadrants or active points */
-  if (qcount == 0 || actives->elem_count == 0)
+  if (qcount == 0 || (rec->points != NULL && actives->elem_count == 0))
     return;
 
   /* determine leaf situation */
@@ -485,6 +498,10 @@ p4est_search_recursion (p4est_t * p4est, p4est_topidx_t which_tree,
     is_leaf = 0;
     local_num = -1;
     lq = p4est_quadrant_array_index (quadrants, quadrants->elem_count - 1);
+    P4EST_ASSERT (!p4est_quadrant_is_equal (q, lq) &&
+                  p4est_quadrant_is_ancestor (quadrant, lq));
+
+    /* skip unnecessary intermediate levels if possible */
     if (p4est_quadrant_ancestor_id (q, quadrant->level + 1) ==
         p4est_quadrant_ancestor_id (lq, quadrant->level + 1)) {
       p4est_nearest_common_ancestor (q, lq, quadrant);
@@ -501,52 +518,68 @@ p4est_search_recursion (p4est_t * p4est, p4est_topidx_t which_tree,
     is_leaf = 1;
 
     /* determine offset of quadrant in local forest */
-    tree = p4est_tree_array_index (p4est->trees, which_tree);
+    tree = p4est_tree_array_index (rec->p4est->trees, rec->which_tree);
     offset = (p4est_locidx_t) ((quadrants->array - tree->quadrants.array)
                                / sizeof (p4est_quadrant_t));
     P4EST_ASSERT (offset >= 0 &&
                   (size_t) offset < tree->quadrants.elem_count);
     local_num = tree->quadrants_offset + offset;
+
+    /* skip unnecessary intermediate levels if possible */
     quadrant = q;
   }
 
   /* execute quadrant callback if present, which may stop the recursion */
-  if (search_quadrant_fn != NULL &&
-      !search_quadrant_fn (p4est, which_tree, quadrant, local_num, NULL)) {
+  if (rec->search_quadrant_fn != NULL &&
+      !rec->search_quadrant_fn (rec->p4est, rec->which_tree,
+                                quadrant, local_num, NULL)) {
     return;
   }
 
-  /* query callback for all points and return if none remain */
-  sc_array_init (&child_actives, sizeof (size_t));
-  for (zz = 0; zz < actives->elem_count; ++zz) {
-    pz = (size_t *) sc_array_index (actives, zz);
-    is_match = search_point_fn (p4est, which_tree, quadrant, local_num,
-                                sc_array_index (points, *pz));
-    if (!is_leaf && is_match) {
-      qz = (size_t *) sc_array_push (&child_actives);
-      *qz = *pz;
+  /* check out points */
+  if (rec->points == NULL) {
+    /* we have called the callback already.  For leaves we are done */
+    if (is_leaf) {
+      return;
+    }
+    chact = NULL;
+  }
+  else {
+    /* query callback for all points and return if none remain */
+    chact = &child_actives;
+    sc_array_init (chact, sizeof (size_t));
+    for (zz = 0; zz < actives->elem_count; ++zz) {
+      pz = (size_t *) sc_array_index (actives, zz);
+      is_match = rec->search_point_fn (rec->p4est, rec->which_tree,
+                                       quadrant, local_num,
+                                       sc_array_index (rec->points, *pz));
+      if (!is_leaf && is_match) {
+        qz = (size_t *) sc_array_push (chact);
+        *qz = *pz;
+      }
+    }
+    if (chact->elem_count == 0) {
+      return;
     }
   }
-  if (child_actives.elem_count == 0)
-    return;
 
   /* leaf situation has returned above */
   P4EST_ASSERT (!is_leaf);
 
   /* split quadrant array and run recursion */
   p4est_split_array (quadrants, (int) quadrant->level, split);
-  p4est_quadrant_childrenv (quadrant, children);
   for (i = 0; i < P4EST_CHILDREN; ++i) {
+    p4est_quadrant_child (quadrant, &child, i);
     if (split[i] < split[i + 1]) {
       sc_array_init_view (&child_quadrants, quadrants,
                           split[i], split[i + 1] - split[i]);
-      p4est_search_recursion (p4est, which_tree, &children[i],
-                              search_quadrant_fn, search_point_fn,
-                              &child_quadrants, points, &child_actives);
+      p4est_search_recursion (rec, &child, &child_quadrants, chact);
       sc_array_reset (&child_quadrants);
     }
   }
-  sc_array_reset (&child_actives);
+  if (chact != NULL) {
+    sc_array_reset (chact);
+  }
 }
 
 void
@@ -556,58 +589,61 @@ p4est_search (p4est_t * p4est, p4est_search_query_t search_quadrant_fn,
   p4est_topidx_t      jt;
   p4est_tree_t       *tree;
   p4est_quadrant_t    root;
-  p4est_quadrant_t    temp, temp2;
   p4est_quadrant_t   *f, *l;
-  uint64_t            midx;
-  sc_array_t          actives;
+  p4est_search_recursion_t srec, *rec = &srec;
+  sc_array_t          actives, *acts;
   sc_array_t         *tquadrants;
   size_t              zz, *pz;
 
-  for (jt = p4est->first_local_tree; jt <= p4est->last_local_tree; ++jt) {
+  /* correct call convention? */
+  P4EST_ASSERT (p4est != NULL);
+  P4EST_ASSERT (points == NULL || search_point_fn != NULL);
 
-    /* start recursion with root quadrant */
-    p4est_quadrant_set_morton (&root, 0, 0);
-    f = NULL;
-    l = NULL;
-    if (jt == p4est->first_local_tree) {
-      f = &p4est->global_first_position[p4est->mpirank];
-      p4est_quadrant_last_descendant (&root, &temp, P4EST_QMAXLEVEL);
-      l = &temp;
-    }
-    if (jt == p4est->last_local_tree) {
-      if (!f) {
-        p4est_quadrant_first_descendant (&root, &temp, P4EST_QMAXLEVEL);
-        f = &temp;
-      }
-      l = &p4est->global_first_position[p4est->mpirank + 1];
-      if (l->p.which_tree == jt) {
-        midx = p4est_quadrant_linear_id (l, P4EST_QMAXLEVEL);
-        p4est_quadrant_set_morton (&temp2, P4EST_QMAXLEVEL, midx - 1);
-        l = &temp2;
-      }
-      else {
-        p4est_quadrant_last_descendant (&root, &temp2, P4EST_QMAXLEVEL);
-        l = &temp2;
-      }
-    }
-    if (f != NULL) {
-      P4EST_ASSERT (l != NULL);
-      p4est_nearest_common_ancestor (f, l, &root);
+  /* we do nothing if there is nothing we can do */
+  if (search_quadrant_fn == NULL && points == NULL) {
+    return;
+  }
+
+  /* prepare start of recursion by listing the active points */
+  if (points == NULL) {
+    /* we ignore the points logic completely */
+    acts = NULL;
+  }
+  else {
+    /* mark all input points as active */
+    acts = &actives;
+    sc_array_init_size (acts, sizeof (size_t), points->elem_count);
+    for (zz = 0; zz < acts->elem_count; ++zz) {
+      pz = (size_t *) sc_array_index (acts, zz);
+      *pz = zz;
     }
+  }
+
+  /* set recursion context */
+  rec->p4est = p4est;
+  rec->which_tree = -1;
+  rec->search_quadrant_fn = search_quadrant_fn;
+  rec->search_point_fn = search_point_fn;
+  rec->points = points;
+  for (jt = p4est->first_local_tree; jt <= p4est->last_local_tree; ++jt) {
+    rec->which_tree = jt;
 
     /* grab complete tree quadrant array */
     tree = p4est_tree_array_index (p4est->trees, jt);
     tquadrants = &tree->quadrants;
 
-    /* mark all points as active */
-    sc_array_init_size (&actives, sizeof (size_t), points->elem_count);
-    for (zz = 0; zz < points->elem_count; ++zz) {
-      pz = (size_t *) sc_array_index (&actives, zz);
-      *pz = zz;
-    }
+    /* find the smallest quadrant that contains all of this tree */
+    f = p4est_quadrant_array_index (tquadrants, 0);
+    l = p4est_quadrant_array_index (tquadrants, tquadrants->elem_count - 1);
+    p4est_nearest_common_ancestor (f, l, &root);
+
+    /* perform top-down search */
+    p4est_search_recursion (rec, &root, tquadrants, acts);
+  }
 
-    p4est_search_recursion (p4est, jt, &root, search_quadrant_fn,
-                            search_point_fn, tquadrants, points, &actives);
-    sc_array_reset (&actives);
+  /* clean up after the tree loop */
+  if (acts != NULL) {
+    P4EST_ASSERT (points->elem_count == acts->elem_count);
+    sc_array_reset (acts);
   }
 }
diff --git a/src/p4est_search.h b/src/p4est_search.h
index 44a454f..4001d41 100644
--- a/src/p4est_search.h
+++ b/src/p4est_search.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -24,6 +25,13 @@
 #ifndef P4EST_SEARCH_H
 #define P4EST_SEARCH_H
 
+/** \file p4est_search.h
+ * Search through quadrants, the local part of a forest, or the partition.
+ *
+ * This file provides several helper functions and recursive algorithms.
+ * \ingroup p4est
+ */
+
 #include <p4est.h>
 
 SC_EXTERN_C_BEGIN;
@@ -44,14 +52,16 @@ ssize_t             p4est_find_higher_bound (sc_array_t * array,
                                              const p4est_quadrant_t * q,
                                              size_t guess);
 
-/** Given a sorted \a array of quadrants that have a common ancestor at level
- * \a level, compute the \a indices of the first quadrant in each of the common
- * ancestor's children at level \a level + 1;
- * \param [in] array     The sorted array of quadrants of level > \a level.
+/** Split an array of quadrants by the children of an ancestor.
+ *
+ * Given a sorted \b array of quadrants that have a common ancestor at level
+ * \b level, compute the \b indices of the first quadrant in each of the common
+ * ancestor's children at level \b level + 1.
+ * \param [in] array     The sorted array of quadrants of level > \b level.
  * \param [in] level     The level at which there is a common ancestor.
  * \param [in,out] indices     The indices of the first quadrant in each of
  *                             the ancestors's children, plus an additional
- *                             index on the end.  The quadrants of \a array
+ *                             index on the end.  The quadrants of \b array
  *                             that are descendants of child i have indices
  *                             between indices[i] and indices[i + 1] - 1.  If
  *                             indices[i] = indices[i+1], this indicates that
@@ -61,7 +71,9 @@ ssize_t             p4est_find_higher_bound (sc_array_t * array,
 void                p4est_split_array (sc_array_t * array, int level,
                                        size_t indices[]);
 
-/** Given two smallest quadrants, \a lq and \a uq, that mark the first and the
+/** Find the boundary points touched by a range of quadrants.
+ *
+ * Given two smallest quadrants, \b lq and \b uq, that mark the first and the
  * last quadrant in a range of quadrants, determine which portions of the tree
  * boundary the range touches.
  * \param [in] lq        The smallest quadrant at the start of the range: if
@@ -73,14 +85,14 @@ void                p4est_split_array (sc_array_t * array, int level,
  * \param [in] level     The level of the containing quadrant whose boundaries
  *                       are tested: 0 if we want to test the boundaries of the
  *                       whole tree.
- * \param [in/out] faces       An array of size 4 that is filled: faces[i] is
+ * \param [in,out] faces       An array of size 4 that is filled: faces[i] is
  *                             true if the range touches that face.
- * \param [in/out] corners     An array of size 4 that is filled: corners[i] is
+ * \param [in,out] corners     An array of size 4 that is filled: corners[i] is
  *                             true if the range touches that corner.
- *                             \faces or \corners may be NULL.
- * \return  Returns an int32_t encoded with the same information in \faces and
- *          \corners: the first (least) four bits represent the four faces,
- *          the next four bits represent the four corners.
+ *                             \b faces or \b corners may be NULL.
+ * \return  Returns an int32_t encoded with the same information in \b faces
+ *          and \b corners: the first (least) four bits represent the four
+ *          faces, the next four bits represent the four corners.
  */
 int32_t             p4est_find_range_boundaries (p4est_quadrant_t * lq,
                                                  p4est_quadrant_t * uq,
@@ -90,7 +102,7 @@ int32_t             p4est_find_range_boundaries (p4est_quadrant_t * lq,
 /** Callback function to query the match of a "point" with a quadrant.
  *
  * This function can be called in two roles:  Per-quadrant, in which case the
- * parameter \a point is NULL, or per-point, possibly many times per quadrant.
+ * parameter \b point is NULL, or per-point, possibly many times per quadrant.
  *
  * \param [in] p4est        The forest to be queried.
  * \param [in] which_tree   The tree id under consideration.
@@ -104,10 +116,10 @@ int32_t             p4est_find_range_boundaries (p4est_quadrant_t * lq,
  *                          it is the (non-negative) index of the quadrant
  *                          relative to the processor-local quadrant storage.
  * \param [in] point        Representation of a "point"; user-defined.
- *                          If \a point is NULL, the callback may be used to
+ *                          If \b point is NULL, the callback may be used to
  *                          prepare quadrant-related search meta data.
- * \return                  If \a point is NULL, true if the search confined to
- *                          \a quadrant should be executed, false to skip it.
+ * \return                  If \b point is NULL, true if the search confined to
+ *                          \b quadrant should be executed, false to skip it.
  *                          Else, true if point may be contained in the
  *                          quadrant and false otherwise; the return value has
  *                          no effect on a leaf.
@@ -118,13 +130,21 @@ typedef int         (*p4est_search_query_t) (p4est_t * p4est,
                                              p4est_locidx_t local_num,
                                              void *point);
 
-/** Search "points" from a given set in the forest.
+/** Search through the local part of a forest.
+ * The search is especially efficient if multiple targets, called "points"
+ * below, are searched for simultaneously.
  *
  * The search runs over all local quadrants and proceeds recursively top-down.
+ * For each tree, it may start at the root of that tree, or further down at the
+ * root of the subtree that contains all of the tree's local quadrants.
+ * Likewise, some intermediate levels in the recursion may be skipped.
  * Its outer loop is thus a depth-first, processor-local forest traversal.
  * Each quadrant in that loop either is a leaf, or a (direct or indirect)
  * strict ancestor of a leaf.  On entering a new quadrant, a user-provided
  * quadrant-callback is executed.
+ *
+ * As a convenience, the user may provide anonymous "points" that are tracked
+ * down the forest.  This way one search call may be used for multiple targets.
  * The set of points that potentially matches a given quadrant diminishes from
  * the root down to the leaves:  For each quadrant, an inner loop over the
  * potentially matching points executes a point-callback for each candidate
@@ -139,12 +159,20 @@ typedef int         (*p4est_search_query_t) (p4est_t * p4est,
  *
  * \param [in] p4est        The forest to be searched.
  * \param [in] search_quadrant_fn   Executed once for each quadrant that is
- *                          entered.  This quadrant is always local.  If the
+ *                          entered.  This quadrant is always local, if not
+ *                          itself then at least one child of it.  If the
  *                          callback returns false, this quadrant and its
  *                          descendants are excluded from the search.
+ *                          Its \b point argument is always NULL.
  *                          May be NULL in which case it is ignored.
- * \param [in] search_point_fn      Must return true for a possible match.
+ * \param [in] search_point_fn      If \b points is not NULL, must not be NULL.
+ *                          Must return true for any possible matching point.
+ *                          If \b points is NULL, this callback is ignored.
  * \param [in] points       User-defined array of "points".
+ *                          If NULL, only the \b search_quadrant_fn callback
+ *                          is executed.  If that is NULL, this function does nothing.
+ *                          If not NULL, the \b search_point_fn is called on
+ *                          its members during the search.
  */
 void                p4est_search (p4est_t * p4est,
                                   p4est_search_query_t search_quadrant_fn,
@@ -153,4 +181,4 @@ void                p4est_search (p4est_t * p4est,
 
 SC_EXTERN_C_END;
 
-#endif
+#endif /* !P4EST_SEARCH_H */
diff --git a/src/p4est_to_p8est.h b/src/p4est_to_p8est.h
index ccde2b9..e038a43 100644
--- a/src/p4est_to_p8est.h
+++ b/src/p4est_to_p8est.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -21,9 +22,13 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
 
-/** \file p4est_to_p8est.h
+/** We do not process this file in Doxygen since it overwrites types.
  *
- * transforms 2D \ref p4est routines into 3D \ref p8est routines
+ * Transform 2D \ref p4est routines into 3D \ref p8est routines.  This file can
+ * be included from a .c file that has been written for 2D to turn it into a 3D
+ * code with minor modifications.  We #define P4_TO_P8, which allows compiling
+ * a source file twice, once for 2D without and once for 3D with including this
+ * header, adding extra code for 3D if necessary.
  */
 
 #ifndef P4EST_TO_P8EST_H
@@ -51,32 +56,9 @@
 #define P4EST_QUADRANT_LEN              P8EST_QUADRANT_LEN
 #define P4EST_LAST_OFFSET               P8EST_LAST_OFFSET
 #define P4EST_QUADRANT_INIT             P8EST_QUADRANT_INIT
+#define P4EST_LEAF_IS_FIRST_IN_TREE     P8EST_LEAF_IS_FIRST_IN_TREE
 
 /* redefine enums */
-#define P4EST_COMM_COUNT_PERTREE        P8EST_COMM_COUNT_PERTREE
-#define P4EST_COMM_BALANCE_FIRST_COUNT  P8EST_COMM_BALANCE_FIRST_COUNT
-#define P4EST_COMM_BALANCE_FIRST_LOAD   P8EST_COMM_BALANCE_FIRST_LOAD
-#define P4EST_COMM_BALANCE_SECOND_COUNT P8EST_COMM_BALANCE_SECOND_COUNT
-#define P4EST_COMM_BALANCE_SECOND_LOAD  P8EST_COMM_BALANCE_SECOND_LOAD
-#define P4EST_COMM_PARTITION_GIVEN      P8EST_COMM_PARTITION_GIVEN
-#define P4EST_COMM_PARTITION_WEIGHTED_LOW P8EST_COMM_PARTITION_WEIGHTED_LOW
-#define P4EST_COMM_PARTITION_WEIGHTED_HIGH P8EST_COMM_PARTITION_WEIGHTED_HIGH
-#define P4EST_COMM_PARTITION_CORRECTION P8EST_COMM_PARTITION_CORRECTION
-#define P4EST_COMM_GHOST_COUNT          P8EST_COMM_GHOST_COUNT
-#define P4EST_COMM_GHOST_LOAD           P8EST_COMM_GHOST_LOAD
-#define P4EST_COMM_GHOST_EXCHANGE       P8EST_COMM_GHOST_EXCHANGE
-#define P4EST_COMM_GHOST_EXPAND_COUNT   P8EST_COMM_GHOST_EXPAND_COUNT
-#define P4EST_COMM_GHOST_EXPAND_LOAD    P8EST_COMM_GHOST_EXPAND_LOAD
-#define P4EST_COMM_GHOST_SUPPORT_COUNT  P8EST_COMM_GHOST_SUPPORT_COUNT
-#define P4EST_COMM_GHOST_SUPPORT_LOAD   P8EST_COMM_GHOST_SUPPORT_LOAD
-#define P4EST_COMM_GHOST_CHECKSUM       P8EST_COMM_GHOST_CHECKSUM
-#define P4EST_COMM_NODES_QUERY          P8EST_COMM_NODES_QUERY
-#define P4EST_COMM_NODES_REPLY          P8EST_COMM_NODES_REPLY
-#define P4EST_COMM_SAVE                 P8EST_COMM_SAVE
-#define P4EST_COMM_LNODES_TEST          P8EST_COMM_LNODES_TEST
-#define P4EST_COMM_LNODES_PASS          P8EST_COMM_LNODES_PASS
-#define P4EST_COMM_LNODES_OWNED         P8EST_COMM_LNODES_OWNED
-#define P4EST_COMM_LNODES_ALL           P8EST_COMM_LNODES_ALL
 #define P4EST_CONNECT_FACE              P8EST_CONNECT_FACE
 #define P4EST_CONNECT_CORNER            P8EST_CONNECT_CORNER
 #define P4EST_CONNECT_FULL              P8EST_CONNECT_FULL
@@ -105,6 +87,7 @@
 #define p4est_coarsen_t                 p8est_coarsen_t
 #define p4est_weight_t                  p8est_weight_t
 #define p4est_ghost_t                   p8est_ghost_t
+#define p4est_ghost_exchange_t          p8est_ghost_exchange_t
 #define p4est_indep_t                   p8est_indep_t
 #define p4est_nodes_t                   p8est_nodes_t
 #define p4est_lnodes_t                  p8est_lnodes_t
@@ -120,11 +103,13 @@
 #define p4est_iter_corner_side_t        p8est_iter_corner_side_t
 #define p4est_iter_corner_info_t        p8est_iter_corner_info_t
 #define p4est_search_query_t            p8est_search_query_t
+#define p4est_traverse_query_t          p8est_traverse_query_t
 #define p4est_mesh_t                    p8est_mesh_t
 #define p4est_mesh_face_neighbor_t      p8est_mesh_face_neighbor_t
 #define p4est_wrap_t                    p8est_wrap_t
 #define p4est_wrap_leaf_t               p8est_wrap_leaf_t
 #define p4est_wrap_flags_t              p8est_wrap_flags_t
+#define p4est_vtk_context_t             p8est_vtk_context_t
 
 /* redefine external variables */
 #define p4est_face_corners              p8est_face_corners
@@ -140,8 +125,10 @@
 #define p4est_connectivity_memory_used  p8est_connectivity_memory_used
 #define p4est_connectivity_new          p8est_connectivity_new
 #define p4est_connectivity_new_brick    p8est_connectivity_new_brick
+#define p4est_connectivity_new_twotrees p8est_connectivity_new_twotrees
 #define p4est_connectivity_new_byname   p8est_connectivity_new_byname
 #define p4est_connectivity_new_copy     p8est_connectivity_new_copy
+#define p4est_connectivity_bcast        p8est_connectivity_bcast
 #define p4est_connectivity_destroy      p8est_connectivity_destroy
 #define p4est_connectivity_set_attr     p8est_connectivity_set_attr
 #define p4est_connectivity_is_valid     p8est_connectivity_is_valid
@@ -168,6 +155,7 @@
 /* functions in p4est */
 #define p4est_qcoord_to_vertex          p8est_qcoord_to_vertex
 #define p4est_memory_used               p8est_memory_used
+#define p4est_revision                  p8est_revision
 #define p4est_new                       p8est_new
 #define p4est_destroy                   p8est_destroy
 #define p4est_copy                      p8est_copy
@@ -191,11 +179,13 @@
 #define p4est_replace_t                 p8est_replace_t
 #define p4est_new_ext                   p8est_new_ext
 #define p4est_mesh_new_ext              p8est_mesh_new_ext
+#define p4est_copy_ext                  p8est_copy_ext
 #define p4est_refine_ext                p8est_refine_ext
 #define p4est_coarsen_ext               p8est_coarsen_ext
 #define p4est_balance_ext               p8est_balance_ext
 #define p4est_balance_subtree_ext       p8est_balance_subtree_ext
 #define p4est_partition_ext             p8est_partition_ext
+#define p4est_partition_for_coarsening  p8est_partition_for_coarsening
 #define p4est_save_ext                  p8est_save_ext
 #define p4est_load_ext                  p8est_load_ext
 #define p4est_source_ext                p8est_source_ext
@@ -253,6 +243,7 @@
 #define p4est_quadrant_ancestor         p8est_quadrant_ancestor
 #define p4est_quadrant_parent           p8est_quadrant_parent
 #define p4est_quadrant_sibling          p8est_quadrant_sibling
+#define p4est_quadrant_child            p8est_quadrant_child
 #define p4est_quadrant_face_neighbor    p8est_quadrant_face_neighbor
 #define p4est_quadrant_face_neighbor_extra p8est_quadrant_face_neighbor_extra
 #define p4est_quadrant_half_face_neighbors p8est_quadrant_half_face_neighbors
@@ -284,6 +275,7 @@
 #define p4est_split_array               p8est_split_array
 #define p4est_find_range_boundaries     p8est_find_range_boundaries
 #define p4est_search                    p8est_search
+#define p4est_traverse                  p8est_traverse
 
 /* functions in p4est_algorithms */
 #define p4est_quadrant_init_data        p8est_quadrant_init_data
@@ -310,9 +302,18 @@
 #define p4est_partition_given           p8est_partition_given
 
 /* functions in p4est_communication */
+#define p4est_comm_parallel_env_assign  p8est_comm_parallel_env_assign
+#define p4est_comm_parallel_env_duplicate p8est_comm_parallel_env_duplicate
+#define p4est_comm_parallel_env_release p8est_comm_parallel_env_release
+#define p4est_comm_parallel_env_replace p8est_comm_parallel_env_replace
+#define p4est_comm_parallel_env_get_info p8est_comm_parallel_env_get_info
+#define p4est_comm_parallel_env_is_null p8est_comm_parallel_env_is_null
+#define p4est_comm_parallel_env_reduce  p8est_comm_parallel_env_reduce
+#define p4est_comm_parallel_env_reduce_ext p8est_comm_parallel_env_reduce_ext
 #define p4est_comm_count_quadrants      p8est_comm_count_quadrants
 #define p4est_comm_global_partition     p8est_comm_global_partition
 #define p4est_comm_count_pertree        p8est_comm_count_pertree
+#define p4est_comm_is_empty             p8est_comm_is_empty
 #define p4est_comm_is_owner             p8est_comm_is_owner
 #define p4est_comm_find_owner           p8est_comm_find_owner
 #define p4est_comm_tree_info            p8est_comm_tree_info
@@ -329,11 +330,17 @@
 #define p4est_geometry_new_connectivity p8est_geometry_new_connectivity
 
 /* functions in p4est_vtk */
+#define p4est_vtk_context_new           p8est_vtk_context_new
+#define p4est_vtk_context_destroy       p8est_vtk_context_destroy
+#define p4est_vtk_context_set_geom      p8est_vtk_context_set_geom
+#define p4est_vtk_context_set_scale     p8est_vtk_context_set_scale
+#define p4est_vtk_context_set_continuous p8est_vtk_context_set_continuous
 #define p4est_vtk_write_file            p8est_vtk_write_file
-#define p4est_vtk_write_all             p8est_vtk_write_all
 #define p4est_vtk_write_header          p8est_vtk_write_header
-#define p4est_vtk_write_point_scalar    p8est_vtk_write_point_scalar
-#define p4est_vtk_write_point_vector    p8est_vtk_write_point_vector
+#define p4est_vtk_write_cell_dataf      p8est_vtk_write_cell_dataf
+#define p4est_vtk_write_cell_data       p8est_vtk_write_cell_data
+#define p4est_vtk_write_point_dataf     p8est_vtk_write_point_dataf
+#define p4est_vtk_write_point_data      p8est_vtk_write_point_data
 #define p4est_vtk_write_footer          p8est_vtk_write_footer
 
 /* functions in p4est_ghost */
@@ -342,8 +349,16 @@
 #define p4est_ghost_new                 p8est_ghost_new
 #define p4est_ghost_destroy             p8est_ghost_destroy
 #define p4est_ghost_exchange_data       p8est_ghost_exchange_data
+#define p4est_ghost_exchange_data_begin p8est_ghost_exchange_data_begin
+#define p4est_ghost_exchange_data_end   p8est_ghost_exchange_data_end
 #define p4est_ghost_exchange_custom     p8est_ghost_exchange_custom
+#define p4est_ghost_exchange_custom_begin p8est_ghost_exchange_custom_begin
+#define p4est_ghost_exchange_custom_end p8est_ghost_exchange_custom_end
 #define p4est_ghost_exchange_custom_levels p8est_ghost_exchange_custom_levels
+#define p4est_ghost_exchange_custom_levels_begin \
+                                        p8est_ghost_exchange_custom_levels_begin
+#define p4est_ghost_exchange_custom_levels_end \
+                                        p8est_ghost_exchange_custom_levels_end
 #define p4est_ghost_bsearch             p8est_ghost_bsearch
 #define p4est_ghost_contains            p8est_ghost_contains
 #define p4est_ghost_is_valid            p8est_ghost_is_valid
@@ -363,6 +378,8 @@
 #define p4est_lnodes_destroy            p8est_lnodes_destroy
 #define p4est_ghost_support_lnodes      p8est_ghost_support_lnodes
 #define p4est_ghost_expand_by_lnodes    p8est_ghost_expand_by_lnodes
+#define p4est_partition_lnodes          p8est_partition_lnodes
+#define p4est_partition_lnodes_detailed p8est_partition_lnodes_detailed
 #define p4est_lnodes_decode             p8est_lnodes_decode
 #define p4est_lnodes_share_owned_begin  p8est_lnodes_share_owned_begin
 #define p4est_lnodes_share_owned_end    p8est_lnodes_share_owned_end
@@ -392,8 +409,14 @@
 
 /* functions in p4est_wrap */
 #define p4est_wrap_new_conn             p8est_wrap_new_conn
+#define p4est_wrap_new_p4est            p8est_wrap_new_p8est
+#define p4est_wrap_new_brick            p8est_wrap_new_brick
 #define p4est_wrap_new_world            p8est_wrap_new_world
+#define p4est_wrap_new_ext              p8est_wrap_new_ext
+#define p4est_wrap_new_copy             p8est_wrap_new_copy
 #define p4est_wrap_destroy              p8est_wrap_destroy
+#define p4est_wrap_set_hollow           p8est_wrap_set_hollow
+#define p4est_wrap_set_coarsen_delay    p8est_wrap_set_coarsen_delay
 #define p4est_wrap_get_ghost            p8est_wrap_get_ghost
 #define p4est_wrap_get_mesh             p8est_wrap_get_mesh
 #define p4est_wrap_mark_refine          p8est_wrap_mark_refine
@@ -406,4 +429,9 @@
 
 /* functions in p4est_plex */
 #define p4est_get_plex_data             p8est_get_plex_data
+#define p4est_get_plex_data_ext         p8est_get_plex_data_ext
+
+/* functions in p4est_connrefine */
+#define p4est_connectivity_refine       p8est_connectivity_refine
+
 #endif /* !P4EST_TO_P8EST_H */
diff --git a/src/p4est_vtk.c b/src/p4est_vtk.c
index b737eb7..6bb113f 100644
--- a/src/p4est_vtk.c
+++ b/src/p4est_vtk.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -31,14 +32,96 @@
 #define P4EST_VTK_CELL_TYPE      8      /* VTK_PIXEL */
 #endif /* !P4_TO_P8 */
 
-#include <sc_io.h>
-
+/* default parameters for the vtk context */
 static const double p4est_vtk_scale = 0.95;
+static const int    p4est_vtk_continuous = 0;
+
+/* default parameters for p4est_vtk_write_file */
 static const int    p4est_vtk_write_tree = 1;
 static const int    p4est_vtk_write_level = 1;
 static const int    p4est_vtk_write_rank = 1;
 static const int    p4est_vtk_wrap_rank = 0;
 
+/** Write a cell scalar field to the VTU file.
+ *
+ * Writing a VTK file is split into a few routines.
+ * This allows there to be an arbitrary number of fields.
+ * When in doubt, please use \ref p4est_vtk_write_cell_data instead.
+ *
+ * \param [in,out] cont    A VTK context created by \ref p4est_vtk_context_new.
+ * \param [in] scalar_name The name of the scalar field.
+ * \param [in] values      The cell values that will be written.
+ *
+ * \return          On success, the context that has been passed in.
+ *                  On failure, returns NULL and deallocates the context.
+ */
+static
+  p4est_vtk_context_t *p4est_vtk_write_cell_scalar (p4est_vtk_context_t *
+                                                    cont,
+                                                    const char *scalar_name,
+                                                    sc_array_t * values);
+
+/** Write a 3-vector cell field to the VTU file.
+ *
+ * Writing a VTK file is split into a few routines.
+ * This allows there to be an arbitrary number of fields.
+ * When in doubt, please use \ref p4est_vtk_write_cell_data instead.
+ *
+ * \param [in,out] cont    A VTK context created by \ref p4est_vtk_context_new.
+ * \param [in] vector_name The name of the vector field.
+ * \param [in] values      The cell values that will be written.
+ *
+ * \return          On success, the context that has been passed in.
+ *                  On failure, returns NULL and deallocates the context.
+ */
+static
+  p4est_vtk_context_t *p4est_vtk_write_cell_vector (p4est_vtk_context_t *
+                                                    cont,
+                                                    const char *vector_name,
+                                                    sc_array_t * values);
+
+/** Write a point scalar field to the VTU file.
+ *
+ * Writing a VTK file is split into a few routines.
+ * This allows there to be an arbitrary number of fields.
+ * When in doubt, please use \ref p4est_vtk_write_point_data instead.
+ *
+ * \param [in,out] cont    A VTK context created by \ref p4est_vtk_context_new.
+ * \param [in] scalar_name The name of the scalar field.
+ * \param [in] values      The point values that will be written.
+ *
+ * \return          On success, the context that has been passed in.
+ *                  On failure, returns NULL and deallocates the context.
+ */
+static
+  p4est_vtk_context_t *p4est_vtk_write_point_scalar (p4est_vtk_context_t *
+                                                     cont,
+                                                     const char *scalar_name,
+                                                     sc_array_t * values);
+
+/** Write a 3-vector point field to the VTU file.
+ *
+ * Writing a VTK file is split into a few routines.
+ * This allows there to be an arbitrary number of fields.
+ * When in doubt, please use \ref p4est_vtk_write_point_data instead.
+ *
+ * \param [in,out] cont    A VTK context created by \ref p4est_vtk_context_new.
+ * \param [in] vector_name The name of the vector field.
+ * \param [in] values      The point values that will be written.
+ *
+ * \return          On success, the context that has been passed in.
+ *                  On failure, returns NULL and deallocates the context.
+ */
+static
+  p4est_vtk_context_t *p4est_vtk_write_point_vector (p4est_vtk_context_t *
+                                                     cont,
+                                                     const char *vector_name,
+                                                     sc_array_t * values);
+
+#ifdef P4_TO_P8
+#define p4est_vtk_context               p8est_vtk_context
+#endif
+
 #ifndef P4EST_VTK_DOUBLES
 #define P4EST_VTK_FLOAT_NAME "Float32"
 #define P4EST_VTK_FLOAT_TYPE float
@@ -66,114 +149,202 @@ p4est_vtk_write_binary (FILE * vtkfile, char *numeric_data,
 
 #endif /* P4EST_VTK_BINARY */
 
+/** Opaque context type for writing VTK output with multiple function calls.
+ *
+ * This structure holds all the information needed for the p4est vtk context.
+ * It is used to relay necessary vtk information to the \b p4est_vtk_write_*
+ * functions. This structure is allocated by \ref p4est_vtk_context_new and
+ * destroyed by \b p4est_vtk_write_footer; it can also be destroyed manually
+ * using the \b p4est_vtk_context_destroy function if necessary.
+ *
+ * The \a p4est member is a pointer to the local p4est.
+ * The \a geom member is a pointer to the geometry used to create the p4est.
+ * The \a num_points member holds the number of nodes present in the vtk output;
+ * this is determined in \ref p4est_vtk_write_header from the \a scale and
+ * \a continuous settings and is used to ensure point data has the proper size.
+ * The \a filename member holds the vtk file basename: for error reporting.
+ * The \a vtufilename, \a pvtufilename, and \a visitfilename members are the
+ * vtk file names.
+ * The \a vtufile, \a pvtufile, and \a visitfile members are the vtk file
+ * pointers; opened by \ref p4est_vtk_write_header and closed by \b
+ * p4est_vtk_write_footer.
+ *
+ */
+struct p4est_vtk_context
+{
+  /* data passed initially */
+  p4est_t            *p4est;       /**< The p4est structure must be alive. */
+  char               *filename;    /**< Original filename provided is copied. */
+
+  /* parameters that can optionally be set in a context */
+  p4est_geometry_t   *geom;        /**< The geometry may be NULL. */
+  double              scale;       /**< Parameter to shrink quadrants. */
+  int                 continuous;  /**< Assume continuous point data? */
+
+  /* internal context data */
+  int                 writing;     /**< True after p4est_vtk_write_header. */
+  p4est_locidx_t      num_corners; /**< Number of local element corners. */
+  p4est_locidx_t      num_points;  /**< Number of VTK points written. */
+  p4est_locidx_t     *node_to_corner;     /**< Map a node to an element corner. */
+  p4est_nodes_t      *nodes;       /**< Only used if scale == 1 and continuous. */
+  char                vtufilename[BUFSIZ];   /**< Each process writes one. */
+  char                pvtufilename[BUFSIZ];  /**< Only root writes this one. */
+  char                visitfilename[BUFSIZ]; /**< Only root writes this one. */
+  FILE               *vtufile;     /**< File pointer for the VTU file. */
+  FILE               *pvtufile;    /**< Paraview meta file. */
+  FILE               *visitfile;   /**< Visit meta file. */
+};
+
+p4est_vtk_context_t *
+p4est_vtk_context_new (p4est_t * p4est, const char *filename)
+{
+  p4est_vtk_context_t *cont;
+
+  P4EST_ASSERT (p4est != NULL);
+  P4EST_ASSERT (filename != NULL);
+
+  /* Allocate, initialize the vtk context.  Important to zero all fields. */
+  cont = P4EST_ALLOC_ZERO (p4est_vtk_context_t, 1);
+
+  cont->p4est = p4est;
+  cont->filename = P4EST_STRDUP (filename);
+
+  cont->scale = p4est_vtk_scale;
+  cont->continuous = p4est_vtk_continuous;
+
+  return cont;
+}
+
 void
-p4est_vtk_write_file (p4est_t * p4est, p4est_geometry_t * geom,
-                      const char *filename)
+p4est_vtk_context_set_geom (p4est_vtk_context_t * cont,
+                            p4est_geometry_t * geom)
 {
-  p4est_vtk_write_all (p4est, geom,
-                       p4est_vtk_scale,
-                       p4est_vtk_write_tree, p4est_vtk_write_level,
-                       p4est_vtk_write_rank, p4est_vtk_wrap_rank,
-                       0, 0, filename);
+  P4EST_ASSERT (cont != NULL);
+  P4EST_ASSERT (!cont->writing);
+
+  cont->geom = geom;
 }
 
 void
-p4est_vtk_write_all (p4est_t * p4est, p4est_geometry_t * geom,
-                     double scale,
-                     int write_tree, int write_level,
-                     int write_rank, int wrap_rank,
-                     int num_scalars, int num_vectors,
-                     const char *filename, ...)
+p4est_vtk_context_set_scale (p4est_vtk_context_t * cont, double scale)
 {
-  int                 retval;
-  int                 i, all;
-  int                 scalar_strlen, vector_strlen;
-  char                point_scalars[BUFSIZ], point_vectors[BUFSIZ];
-  const char         *name, **names;
-  double            **values;
-  va_list             ap;
+  P4EST_ASSERT (cont != NULL);
+  P4EST_ASSERT (!cont->writing);
+  P4EST_ASSERT (0. < scale && scale <= 1.);
+
+  cont->scale = scale;
+}
 
-  P4EST_ASSERT (num_scalars >= 0 && num_vectors >= 0);
+void
+p4est_vtk_context_set_continuous (p4est_vtk_context_t * cont, int continuous)
+{
+  P4EST_ASSERT (cont != NULL);
+  P4EST_ASSERT (!cont->writing);
 
-  values = P4EST_ALLOC (double *, num_scalars + num_vectors);
-  names = P4EST_ALLOC (const char *, num_scalars + num_vectors);
+  cont->continuous = continuous;
+}
 
-  va_start (ap, filename);
-  all = 0;
-  scalar_strlen = 0;
-  point_scalars[0] = '\0';
-  for (i = 0; i < num_scalars; ++all, ++i) {
-    name = names[all] = va_arg (ap, const char *);
-    retval = snprintf (point_scalars + scalar_strlen, BUFSIZ - scalar_strlen,
-                       "%s%s", i == 0 ? "" : ",", name);
-    SC_CHECK_ABORT (retval > 0,
-                    P4EST_STRING "_vtk: Error collecting point scalars");
-    scalar_strlen += retval;
-    values[all] = va_arg (ap, double *);
-  }
-  vector_strlen = 0;
-  point_vectors[0] = '\0';
-  for (i = 0; i < num_vectors; ++all, ++i) {
-    name = names[all] = va_arg (ap, const char *);
-    retval = snprintf (point_vectors + vector_strlen, BUFSIZ - vector_strlen,
-                       "%s%s", i == 0 ? "" : ",", name);
-    SC_CHECK_ABORT (retval > 0,
-                    P4EST_STRING "_vtk: Error collecting point vectors");
-    vector_strlen += retval;
-    values[all] = va_arg (ap, double *);
+void
+p4est_vtk_context_destroy (p4est_vtk_context_t * context)
+{
+  P4EST_ASSERT (context != NULL);
+  P4EST_ASSERT (context->p4est != NULL);
+
+  /* since this function is called inside write_header and write_footer,
+   * we cannot assume a consistent state of all member variables */
+
+  P4EST_ASSERT (context->filename != NULL);
+  P4EST_FREE (context->filename);
+
+  /* deallocate node storage */
+  if (context->nodes != NULL) {
+    p4est_nodes_destroy (context->nodes);
   }
-  va_end (ap);
+  P4EST_FREE (context->node_to_corner);
 
-  retval = p4est_vtk_write_header (p4est, geom, scale,
-                                   write_tree, write_level,
-                                   write_rank, wrap_rank,
-                                   num_scalars > 0 ? point_scalars : NULL,
-                                   num_vectors > 0 ? point_vectors : NULL,
-                                   filename);
-  SC_CHECK_ABORT (!retval, P4EST_STRING "_vtk: Error writing header");
+  /* Close all file pointers. */
+  if (context->vtufile != NULL) {
+    if (fclose (context->vtufile)) {
+      P4EST_LERRORF (P4EST_STRING "_vtk: Error closing <%s>.\n",
+                     context->vtufilename);
+    }
+    context->vtufile = NULL;
+  }
 
-  all = 0;
-  for (i = 0; i < num_scalars; ++all, ++i) {
-    retval = p4est_vtk_write_point_scalar (p4est, geom, filename,
-                                           names[all], values[all]);
-    SC_CHECK_ABORT (!retval,
-                    P4EST_STRING "_vtk: Error writing point scalars");
+  /* Close paraview master file */
+  if (context->pvtufile != NULL) {
+    /* Only the root process opens/closes these files. */
+    P4EST_ASSERT (context->p4est->mpirank == 0);
+    if (fclose (context->pvtufile)) {
+      P4EST_LERRORF (P4EST_STRING "_vtk: Error closing <%s>.\n",
+                     context->pvtufilename);
+    }
+    context->pvtufile = NULL;
   }
-  for (i = 0; i < num_vectors; ++all, ++i) {
-    retval = p4est_vtk_write_point_vector (p4est, geom, filename,
-                                           names[all], values[all]);
-    SC_CHECK_ABORT (!retval,
-                    P4EST_STRING "_vtk: Error writing point vectors");
+
+  /* Close visit master file */
+  if (context->visitfile != NULL) {
+    /* Only the root process opens/closes these files. */
+    P4EST_ASSERT (context->p4est->mpirank == 0);
+    if (fclose (context->visitfile)) {
+      P4EST_LERRORF (P4EST_STRING "_vtk: Error closing <%s>.\n",
+                     context->visitfilename);
+    }
+    context->visitfile = NULL;
   }
 
-  retval = p4est_vtk_write_footer (p4est, filename);
-  SC_CHECK_ABORT (!retval, P4EST_STRING "_vtk: Error writing footer");
+  /* Free context structure. */
+  P4EST_FREE (context);
+}
 
-  P4EST_FREE (values);
-  P4EST_FREE (names);
+void
+p4est_vtk_write_file (p4est_t * p4est, p4est_geometry_t * geom,
+                      const char *filename)
+{
+  int                 retval;
+  p4est_vtk_context_t *cont;
+
+  /* allocate context and set parameters */
+  cont = p4est_vtk_context_new (p4est, filename);
+  p4est_vtk_context_set_geom (cont, geom);
+
+  /* We do not write point data, so it is safe to set continuous to true.
+   * This will not save any space though since the default scale is < 1. */
+  p4est_vtk_context_set_continuous (cont, 1);
+
+  /* write header, that is, vertex positions and quadrant-to-vertex map */
+  cont = p4est_vtk_write_header (cont);
+  SC_CHECK_ABORT (cont != NULL, P4EST_STRING "_vtk: Error writing header");
+
+  /* write the tree/level/rank data */
+  cont =
+    p4est_vtk_write_cell_dataf (cont, p4est_vtk_write_tree,
+                                p4est_vtk_write_level, p4est_vtk_write_rank,
+                                p4est_vtk_wrap_rank, 0, 0, cont);
+  SC_CHECK_ABORT (cont != NULL, P4EST_STRING "_vtk: Error writing cell data");
+
+  /* properly write rest of the files' contents */
+  retval = p4est_vtk_write_footer (cont);
+  SC_CHECK_ABORT (!retval, P4EST_STRING "_vtk: Error writing footer");
 }
 
-int
-p4est_vtk_write_header (p4est_t * p4est, p4est_geometry_t * geom,
-                        double scale,
-                        int write_tree, int write_level,
-                        int write_rank, int wrap_rank,
-                        const char *point_scalars, const char *point_vectors,
-                        const char *filename)
+p4est_vtk_context_t *
+p4est_vtk_write_header (p4est_vtk_context_t * cont)
 {
-  p4est_connectivity_t *connectivity = p4est->connectivity;
-  sc_array_t         *trees = p4est->trees;
-  const int           mpirank = p4est->mpirank;
   const double        intsize = 1.0 / P4EST_ROOT_LEN;
-  const double       *v = connectivity->vertices;
-  const p4est_topidx_t first_local_tree = p4est->first_local_tree;
-  const p4est_topidx_t last_local_tree = p4est->last_local_tree;
-  const p4est_topidx_t *tree_to_vertex = connectivity->tree_to_vertex;
-  const p4est_locidx_t Ncells = p4est->local_num_quadrants;
-  const p4est_locidx_t Ncorners = P4EST_CHILDREN * Ncells;
+  int                 mpirank;
+  int                 conti;
+  double              scale;
+  const char         *filename;
+  const double       *v;
+  const p4est_topidx_t *tree_to_vertex;
+  p4est_topidx_t      first_local_tree, last_local_tree;
+  p4est_locidx_t      Ncells, Ncorners;
+  p4est_t            *p4est;
+  p4est_connectivity_t *connectivity;
+  p4est_geometry_t   *geom;
 #ifdef P4EST_VTK_ASCII
   double              wx, wy, wz;
-  p4est_locidx_t      sk;
 #else
   int                 retval;
   uint8_t            *uint8_data;
@@ -188,68 +359,125 @@ p4est_vtk_write_header (p4est_t * p4est, p4est_geometry_t * geom,
   size_t              num_quads, zz;
   p4est_topidx_t      jt;
   p4est_topidx_t      vt[P4EST_CHILDREN];
-  p4est_locidx_t      quad_count, Ntotal;
-  p4est_locidx_t      il;
+  p4est_locidx_t      quad_count, Npoints;
+  p4est_locidx_t      sk, il, ntcid, *ntc;
   P4EST_VTK_FLOAT_TYPE *float_data;
   sc_array_t         *quadrants, *indeps;
+  sc_array_t         *trees;
   p4est_tree_t       *tree;
   p4est_quadrant_t   *quad;
   p4est_nodes_t      *nodes;
   p4est_indep_t      *in;
-  char                vtufilename[BUFSIZ];
-  FILE               *vtufile;
 
-  SC_CHECK_ABORT (p4est->connectivity->num_vertices > 0,
-                  "Must provide connectivity with vertex information");
+  /* check a whole bunch of assertions, here and below */
+  P4EST_ASSERT (cont != NULL);
+  P4EST_ASSERT (!cont->writing);
+
+  /* avoid uninitialized warning */
+  for (k = 0; k < P4EST_CHILDREN; ++k) {
+    vt[k] = -(k + 1);
+  }
 
-  P4EST_ASSERT (0. <= scale && scale <= 1. && wrap_rank >= 0);
-  P4EST_ASSERT (v != NULL && tree_to_vertex != NULL);
+  /* from now on this context is officially in use for writing */
+  cont->writing = 1;
+
+  /* grab context variables */
+  p4est = cont->p4est;
+  filename = cont->filename;
+  geom = cont->geom;
+  scale = cont->scale;
+  conti = cont->continuous;
+  P4EST_ASSERT (filename != NULL);
+
+  /* grab details from the forest */
+  P4EST_ASSERT (p4est != NULL);
+  mpirank = p4est->mpirank;
+  connectivity = p4est->connectivity;
+  P4EST_ASSERT (connectivity != NULL);
+  v = connectivity->vertices;
+  tree_to_vertex = connectivity->tree_to_vertex;
+  if (geom == NULL) {
+    SC_CHECK_ABORT (connectivity->num_vertices > 0,
+                    "Must provide connectivity with vertex information");
+    P4EST_ASSERT (v != NULL && tree_to_vertex != NULL);
+  }
+  trees = p4est->trees;
+  first_local_tree = p4est->first_local_tree;
+  last_local_tree = p4est->last_local_tree;
+  Ncells = p4est->local_num_quadrants;
 
-  if (scale < 1.) {
+  cont->num_corners = Ncorners = P4EST_CHILDREN * Ncells;
+  if (scale < 1. || !conti) {
     /* when we scale the quadrants we need each corner separately */
-    nodes = NULL;
+    cont->nodes = nodes = NULL;
+    cont->num_points = Npoints = Ncorners;
+    cont->node_to_corner = ntc = NULL;
     indeps = NULL;
-    Ntotal = Ncorners;
   }
   else {
-    /* when scale == 1. we can reuse shared quadrant corners */
-    nodes = p4est_nodes_new (p4est, NULL);
+    /* if scale == 1. and the point data is continuous,
+     * we can reuse shared quadrant corners */
+    cont->nodes = nodes = p4est_nodes_new (p4est, NULL);
     indeps = &nodes->indep_nodes;
-    Ntotal = nodes->num_owned_indeps;
-    P4EST_ASSERT ((size_t) Ntotal == indeps->elem_count);
+    cont->num_points = Npoints = nodes->num_owned_indeps;
+    P4EST_ASSERT ((size_t) Npoints == indeps->elem_count);
+
+    /* Establish a reverse lookup table from a node to its first reference.
+     * It is slow to run twice through memory like this.  However, we also know
+     * that writing data to disk is slower still, so we do not optimize.
+     */
+    cont->node_to_corner = ntc = P4EST_ALLOC (p4est_locidx_t, Npoints);
+    memset (ntc, -1, Npoints * sizeof (p4est_locidx_t));
+    for (sk = 0, il = 0; il < Ncells; ++il) {
+      for (k = 0; k < P4EST_CHILDREN; ++sk, ++k) {
+        ntcid = nodes->local_nodes[sk];
+        P4EST_ASSERT (0 <= ntcid && ntcid < Npoints);
+        if (ntc[ntcid] < 0) {
+          ntc[ntcid] = sk;
+        }
+      }
+    }
+#ifdef P4EST_ENABLE_DEBUG
+    /* the particular version of nodes we call makes sure they are tight */
+    for (ntcid = 0; ntcid < Npoints; ++ntcid) {
+      P4EST_ASSERT (0 <= ntc[ntcid] && ntc[ntcid] < Ncorners);
+    }
+#endif
   }
 
   /* Have each proc write to its own file */
-  snprintf (vtufilename, BUFSIZ, "%s_%04d.vtu", filename, mpirank);
+  snprintf (cont->vtufilename, BUFSIZ, "%s_%04d.vtu", filename, mpirank);
   /* Use "w" for writing the initial part of the file.
    * For further parts, use "r+" and fseek so write_compressed succeeds.
    */
-  vtufile = fopen (vtufilename, "wb");
-  if (vtufile == NULL) {
-    P4EST_LERRORF ("Could not open %s for output\n", vtufilename);
-    return -1;
+  cont->vtufile = fopen (cont->vtufilename, "wb");
+  if (cont->vtufile == NULL) {
+    P4EST_LERRORF ("Could not open %s for output\n", cont->vtufilename);
+    p4est_vtk_context_destroy (cont);
+    return NULL;
   }
 
-  fprintf (vtufile, "<?xml version=\"1.0\"?>\n");
-  fprintf (vtufile, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\"");
+  fprintf (cont->vtufile, "<?xml version=\"1.0\"?>\n");
+  fprintf (cont->vtufile,
+           "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\"");
 #if defined P4EST_VTK_BINARY && defined P4EST_VTK_COMPRESSION
-  fprintf (vtufile, " compressor=\"vtkZLibDataCompressor\"");
+  fprintf (cont->vtufile, " compressor=\"vtkZLibDataCompressor\"");
 #endif
 #ifdef SC_IS_BIGENDIAN
-  fprintf (vtufile, " byte_order=\"BigEndian\">\n");
+  fprintf (cont->vtufile, " byte_order=\"BigEndian\">\n");
 #else
-  fprintf (vtufile, " byte_order=\"LittleEndian\">\n");
+  fprintf (cont->vtufile, " byte_order=\"LittleEndian\">\n");
 #endif
-  fprintf (vtufile, "  <UnstructuredGrid>\n");
-  fprintf (vtufile,
+  fprintf (cont->vtufile, "  <UnstructuredGrid>\n");
+  fprintf (cont->vtufile,
            "    <Piece NumberOfPoints=\"%lld\" NumberOfCells=\"%lld\">\n",
-           (long long) Ntotal, (long long) Ncells);
-  fprintf (vtufile, "      <Points>\n");
+           (long long) Npoints, (long long) Ncells);
+  fprintf (cont->vtufile, "      <Points>\n");
 
-  float_data = P4EST_ALLOC (P4EST_VTK_FLOAT_TYPE, 3 * Ntotal);
+  float_data = P4EST_ALLOC (P4EST_VTK_FLOAT_TYPE, 3 * Npoints);
 
   /* write point position data */
-  fprintf (vtufile, "        <DataArray type=\"%s\" Name=\"Position\""
+  fprintf (cont->vtufile, "        <DataArray type=\"%s\" Name=\"Position\""
            " NumberOfComponents=\"3\" format=\"%s\">\n",
            P4EST_VTK_FLOAT_NAME, P4EST_VTK_FORMAT_STRING);
 
@@ -261,10 +489,18 @@ p4est_vtk_write_header (p4est_t * p4est, p4est_geometry_t * geom,
       num_quads = quadrants->elem_count;
 
       /* retrieve corners of the tree */
-      for (k = 0; k < P4EST_CHILDREN; ++k)
-        vt[k] = tree_to_vertex[jt * P4EST_CHILDREN + k];
+      if (geom == NULL) {
+        for (k = 0; k < P4EST_CHILDREN; ++k) {
+          vt[k] = tree_to_vertex[jt * P4EST_CHILDREN + k];
+        }
+      }
+      else {
+        /* provoke crash on logic bug */
+        P4EST_ASSERT (vt[0] == -1);
+        v = NULL;
+      }
 
-      /* loop over the elements in tree and calculated vertex coordinates */
+      /* loop over the elements in tree and calculate vertex coordinates */
       for (zz = 0; zz < num_quads; ++zz, ++quad_count) {
         quad = p4est_quadrant_array_index (quadrants, zz);
         h2 = .5 * intsize * P4EST_QUADRANT_LEN (quad->level);
@@ -317,7 +553,7 @@ p4est_vtk_write_header (p4est_t * p4est, p4est_geometry_t * geom,
         P4EST_ASSERT (k == P4EST_CHILDREN);
       }
     }
-    P4EST_ASSERT (P4EST_CHILDREN * quad_count == Ntotal);
+    P4EST_ASSERT (P4EST_CHILDREN * quad_count == Npoints);
   }
   else {
     for (zz = 0; zz < indeps->elem_count; ++zz) {
@@ -325,8 +561,16 @@ p4est_vtk_write_header (p4est_t * p4est, p4est_geometry_t * geom,
 
       /* retrieve corners of the tree */
       jt = in->p.which_tree;
-      for (k = 0; k < P4EST_CHILDREN; ++k)
-        vt[k] = tree_to_vertex[jt * P4EST_CHILDREN + k];
+      if (geom == NULL) {
+        for (k = 0; k < P4EST_CHILDREN; ++k) {
+          vt[k] = tree_to_vertex[jt * P4EST_CHILDREN + k];
+        }
+      }
+      else {
+        /* provoke crash on logic bug */
+        P4EST_ASSERT (vt[0] == -1);
+        v = NULL;
+      }
 
       /* calculate vertex coordinates */
       eta_x = intsize * in->x;
@@ -367,164 +611,568 @@ p4est_vtk_write_header (p4est_t * p4est, p4est_geometry_t * geom,
   }
 
 #ifdef P4EST_VTK_ASCII
-  for (il = 0; il < Ntotal; ++il) {
+  for (il = 0; il < Npoints; ++il) {
     wx = float_data[3 * il + 0];
     wy = float_data[3 * il + 1];
     wz = float_data[3 * il + 2];
 
+    fprintf (cont->vtufile,
 #ifdef P4EST_VTK_DOUBLES
-    fprintf (vtufile, "     %24.16e %24.16e %24.16e\n", wx, wy, wz);
+             "     %24.16e %24.16e %24.16e\n",
 #else
-    fprintf (vtufile, "          %16.8e %16.8e %16.8e\n", wx, wy, wz);
+             "          %16.8e %16.8e %16.8e\n",
 #endif
+             wx, wy, wz);
   }
 #else
-  fprintf (vtufile, "          ");
+  fprintf (cont->vtufile, "          ");
   /* TODO: Don't allocate the full size of the array, only allocate
    * the chunk that will be passed to zlib and do this a chunk
    * at a time.
    */
-  retval = p4est_vtk_write_binary (vtufile, (char *) float_data,
-                                   sizeof (*float_data) * 3 * Ntotal);
-  fprintf (vtufile, "\n");
+  retval = p4est_vtk_write_binary (cont->vtufile, (char *) float_data,
+                                   sizeof (*float_data) * 3 * Npoints);
+  fprintf (cont->vtufile, "\n");
   if (retval) {
     P4EST_LERROR (P4EST_STRING "_vtk: Error encoding points\n");
-    fclose (vtufile);
-    return -1;
+    p4est_vtk_context_destroy (cont);
+    P4EST_FREE (float_data);
+    return NULL;
   }
 #endif
   P4EST_FREE (float_data);
-  fprintf (vtufile, "        </DataArray>\n");
-  fprintf (vtufile, "      </Points>\n");
-  fprintf (vtufile, "      <Cells>\n");
+
+  fprintf (cont->vtufile, "        </DataArray>\n");
+  fprintf (cont->vtufile, "      </Points>\n");
+  fprintf (cont->vtufile, "      <Cells>\n");
 
   /* write connectivity data */
-  fprintf (vtufile, "        <DataArray type=\"%s\" Name=\"connectivity\""
+  fprintf (cont->vtufile,
+           "        <DataArray type=\"%s\" Name=\"connectivity\""
            " format=\"%s\">\n", P4EST_VTK_LOCIDX, P4EST_VTK_FORMAT_STRING);
 #ifdef P4EST_VTK_ASCII
   for (sk = 0, il = 0; il < Ncells; ++il) {
-    fprintf (vtufile, "         ");
+    fprintf (cont->vtufile, "         ");
     for (k = 0; k < P4EST_CHILDREN; ++sk, ++k) {
-      fprintf (vtufile, " %lld", nodes == NULL ?
+      fprintf (cont->vtufile, " %lld", nodes == NULL ?
                (long long) sk : (long long) nodes->local_nodes[sk]);
     }
-    fprintf (vtufile, "\n");
+    fprintf (cont->vtufile, "\n");
   }
 #else
-  locidx_data = P4EST_ALLOC (p4est_locidx_t, Ncorners);
-  fprintf (vtufile, "          ");
+  fprintf (cont->vtufile, "          ");
   if (nodes == NULL) {
+    locidx_data = P4EST_ALLOC (p4est_locidx_t, Ncorners);
     for (il = 0; il < Ncorners; ++il) {
       locidx_data[il] = il;
     }
     retval =
-      p4est_vtk_write_binary (vtufile, (char *) locidx_data,
-                              sizeof (*locidx_data) * Ncorners);
+      p4est_vtk_write_binary (cont->vtufile, (char *) locidx_data,
+                              sizeof (p4est_locidx_t) * Ncorners);
+    P4EST_FREE (locidx_data);
   }
   else {
     retval =
-      p4est_vtk_write_binary (vtufile, (char *) nodes->local_nodes,
-                              sizeof (*locidx_data) * Ncorners);
+      p4est_vtk_write_binary (cont->vtufile, (char *) nodes->local_nodes,
+                              sizeof (p4est_locidx_t) * Ncorners);
   }
-  fprintf (vtufile, "\n");
+  fprintf (cont->vtufile, "\n");
   if (retval) {
     P4EST_LERROR (P4EST_STRING "_vtk: Error encoding connectivity\n");
-    fclose (vtufile);
-    return -1;
+    p4est_vtk_context_destroy (cont);
+    return NULL;
   }
 #endif
-  fprintf (vtufile, "        </DataArray>\n");
+  fprintf (cont->vtufile, "        </DataArray>\n");
 
   /* write offset data */
-  fprintf (vtufile, "        <DataArray type=\"%s\" Name=\"offsets\""
+  fprintf (cont->vtufile, "        <DataArray type=\"%s\" Name=\"offsets\""
            " format=\"%s\">\n", P4EST_VTK_LOCIDX, P4EST_VTK_FORMAT_STRING);
 #ifdef P4EST_VTK_ASCII
-  fprintf (vtufile, "         ");
+  fprintf (cont->vtufile, "         ");
   for (il = 1, sk = 1; il <= Ncells; ++il, ++sk) {
-    fprintf (vtufile, " %lld", (long long) (P4EST_CHILDREN * il));
+    fprintf (cont->vtufile, " %lld", (long long) (P4EST_CHILDREN * il));
     if (!(sk % 8) && il != Ncells)
-      fprintf (vtufile, "\n         ");
+      fprintf (cont->vtufile, "\n         ");
   }
-  fprintf (vtufile, "\n");
+  fprintf (cont->vtufile, "\n");
 #else
+  locidx_data = P4EST_ALLOC (p4est_locidx_t, Ncells);
   for (il = 1; il <= Ncells; ++il)
     locidx_data[il - 1] = P4EST_CHILDREN * il;  /* same type */
 
-  fprintf (vtufile, "          ");
-  retval = p4est_vtk_write_binary (vtufile, (char *) locidx_data,
-                                   sizeof (*locidx_data) * Ncells);
-  fprintf (vtufile, "\n");
+  fprintf (cont->vtufile, "          ");
+  retval = p4est_vtk_write_binary (cont->vtufile, (char *) locidx_data,
+                                   sizeof (p4est_locidx_t) * Ncells);
+  fprintf (cont->vtufile, "\n");
+
+  P4EST_FREE (locidx_data);
+
   if (retval) {
     P4EST_LERROR (P4EST_STRING "_vtk: Error encoding offsets\n");
-    fclose (vtufile);
-    return -1;
+    p4est_vtk_context_destroy (cont);
+    return NULL;
   }
 #endif
-  fprintf (vtufile, "        </DataArray>\n");
+  fprintf (cont->vtufile, "        </DataArray>\n");
 
   /* write type data */
-  fprintf (vtufile, "        <DataArray type=\"UInt8\" Name=\"types\""
+  fprintf (cont->vtufile, "        <DataArray type=\"UInt8\" Name=\"types\""
            " format=\"%s\">\n", P4EST_VTK_FORMAT_STRING);
 #ifdef P4EST_VTK_ASCII
-  fprintf (vtufile, "         ");
+  fprintf (cont->vtufile, "         ");
   for (il = 0, sk = 1; il < Ncells; ++il, ++sk) {
-    fprintf (vtufile, " %d", P4EST_VTK_CELL_TYPE);
+    fprintf (cont->vtufile, " %d", P4EST_VTK_CELL_TYPE);
     if (!(sk % 20) && il != (Ncells - 1))
-      fprintf (vtufile, "\n         ");
+      fprintf (cont->vtufile, "\n         ");
   }
-  fprintf (vtufile, "\n");
+  fprintf (cont->vtufile, "\n");
 #else
   uint8_data = P4EST_ALLOC (uint8_t, Ncells);
   for (il = 0; il < Ncells; ++il)
     uint8_data[il] = P4EST_VTK_CELL_TYPE;
 
-  fprintf (vtufile, "          ");
-  retval = p4est_vtk_write_binary (vtufile, (char *) uint8_data,
+  fprintf (cont->vtufile, "          ");
+  retval = p4est_vtk_write_binary (cont->vtufile, (char *) uint8_data,
                                    sizeof (*uint8_data) * Ncells);
-  fprintf (vtufile, "\n");
+  fprintf (cont->vtufile, "\n");
+
+  P4EST_FREE (uint8_data);
+
   if (retval) {
     P4EST_LERROR (P4EST_STRING "_vtk: Error encoding types\n");
-    fclose (vtufile);
-    return -1;
+    p4est_vtk_context_destroy (cont);
+    return NULL;
   }
 #endif
-  fprintf (vtufile, "        </DataArray>\n");
-  fprintf (vtufile, "      </Cells>\n");
+  fprintf (cont->vtufile, "        </DataArray>\n");
+  fprintf (cont->vtufile, "      </Cells>\n");
 
-  if (write_tree || write_level || write_rank) {
-    char                vtkCellDataString[BUFSIZ] = "";
-    int                 printed = 0;
+  if (ferror (cont->vtufile)) {
+    P4EST_LERROR (P4EST_STRING "_vtk: Error writing header\n");
+    p4est_vtk_context_destroy (cont);
+    return NULL;
+  }
 
-    if (write_tree)
-      printed +=
-        snprintf (vtkCellDataString + printed, BUFSIZ - printed, "treeid");
-    if (write_level)
-      printed +=
-        snprintf (vtkCellDataString + printed, BUFSIZ - printed,
-                  printed > 0 ? ",level" : "level");
-    if (write_rank)
-      printed +=
-        snprintf (vtkCellDataString + printed, BUFSIZ - printed,
-                  printed > 0 ? ",mpirank" : "mpirank");
+  /* Only have the root write to the parallel vtk file */
+  if (mpirank == 0) {
+    snprintf (cont->pvtufilename, BUFSIZ, "%s.pvtu", filename);
+
+    cont->pvtufile = fopen (cont->pvtufilename, "wb");
+    if (!cont->pvtufile) {
+      P4EST_LERRORF ("Could not open %s for output\n", cont->pvtufilename);
+      p4est_vtk_context_destroy (cont);
+      return NULL;
+    }
+
+    fprintf (cont->pvtufile, "<?xml version=\"1.0\"?>\n");
+    fprintf (cont->pvtufile,
+             "<VTKFile type=\"PUnstructuredGrid\" version=\"0.1\"");
+#if defined P4EST_VTK_BINARY && defined P4EST_VTK_COMPRESSION
+    fprintf (cont->pvtufile, " compressor=\"vtkZLibDataCompressor\"");
+#endif
+#ifdef SC_IS_BIGENDIAN
+    fprintf (cont->pvtufile, " byte_order=\"BigEndian\">\n");
+#else
+    fprintf (cont->pvtufile, " byte_order=\"LittleEndian\">\n");
+#endif
+
+    fprintf (cont->pvtufile, "  <PUnstructuredGrid GhostLevel=\"0\">\n");
+    fprintf (cont->pvtufile, "    <PPoints>\n");
+    fprintf (cont->pvtufile, "      <PDataArray type=\"%s\" Name=\"Position\""
+             " NumberOfComponents=\"3\" format=\"%s\"/>\n",
+             P4EST_VTK_FLOAT_NAME, P4EST_VTK_FORMAT_STRING);
+    fprintf (cont->pvtufile, "    </PPoints>\n");
+
+    if (ferror (cont->pvtufile)) {
+      P4EST_LERROR (P4EST_STRING "_vtk: Error writing parallel header\n");
+      p4est_vtk_context_destroy (cont);
+      return NULL;
+    }
+
+    /* Create a master file for visualization in Visit; this will be used
+     * only in p4est_vtk_write_footer().
+     */
+    snprintf (cont->visitfilename, BUFSIZ, "%s.visit", filename);
+    cont->visitfile = fopen (cont->visitfilename, "wb");
+    if (!cont->visitfile) {
+      P4EST_LERRORF ("Could not open %s for output\n", cont->visitfilename);
+      p4est_vtk_context_destroy (cont);
+      return NULL;
+    }
+  }
+
+  /* the nodes object is no longer needed */
+  if (nodes != NULL) {
+    p4est_nodes_destroy (cont->nodes);
+    cont->nodes = NULL;
+  }
+
+  return cont;
+}
+
+/** Write VTK point data.
+ *
+ * This function exports custom point data to the vtk file; it is functionally
+ * the same as \b p4est_vtk_write_point_dataf with the only difference being
+ * that instead of a variable argument list, an initialized \a va_list is
+ * passed as the last argument. The \a va_list is initialized from the variable
+ * argument list of the calling function.
+ *
+ * \note This function is actually called from \b p4est_vtk_write_point_dataf
+ * and does all of the work.
+ *
+ * \param [in,out] cont    A vtk context created by \ref p4est_vtk_context_new.
+ * \param [in] num_point_scalars Number of point scalar datasets to output.
+ * \param [in] num_point_vectors Number of point vector datasets to output.
+ * \param [in,out] ap      An initialized va_list used to access the
+ *                         scalar/vector data.
+ *
+ * \return          On success, the context that has been passed in.
+ *                  On failure, returns NULL and deallocates the context.
+ */
+static p4est_vtk_context_t *
+p4est_vtk_write_point_datav (p4est_vtk_context_t * cont,
+                             int num_point_scalars,
+                             int num_point_vectors, va_list ap)
+{
+  const int           num_point_all = num_point_scalars + num_point_vectors;
+  int                 mpirank;
+  int                 retval;
+  int                 i, all;
+  int                 scalar_strlen, vector_strlen;
+  char                point_scalars[BUFSIZ], point_vectors[BUFSIZ];
+  const char         *name, **names;
+  p4est_vtk_context_t *list_end;
+  sc_array_t        **values;
+
+  P4EST_ASSERT (cont != NULL && cont->writing);
+  P4EST_ASSERT (cont->p4est != NULL);
+
+  /* This function needs to do nothing if there is no data. */
+  if (!(num_point_scalars || num_point_vectors)) {
+    return cont;
+  }
+  mpirank = cont->p4est->mpirank;
+
+  /* Allocate storage to manage the data fields. */
+  values = P4EST_ALLOC (sc_array_t *, num_point_all);
+  names = P4EST_ALLOC (const char *, num_point_all);
+
+  /* Gather point data; a truncated name list aborts instead of overrunning. */
+  all = 0;
+  scalar_strlen = 0;
+  point_scalars[0] = '\0';
+  for (i = 0; i < num_point_scalars; ++all, ++i) {
+    name = names[all] = va_arg (ap, const char *);
+    retval = snprintf (point_scalars + scalar_strlen, BUFSIZ - scalar_strlen,
+                       "%s%s", i == 0 ? "" : ",", name);
+    SC_CHECK_ABORT (retval > 0 && retval < BUFSIZ - scalar_strlen,
+                    P4EST_STRING "_vtk: Error collecting point scalars");
+    scalar_strlen += retval;
+    values[all] = va_arg (ap, sc_array_t *);
+
+    /* Validate input. */
+    SC_CHECK_ABORT (values[all]->elem_size == sizeof (double),
+                    P4EST_STRING
+                    "_vtk: Error: incorrect point scalar data type;"
+                    " scalar data must contain doubles.");
+    SC_CHECK_ABORT (values[all]->elem_count == (size_t) cont->num_corners,
+                    P4EST_STRING
+                    "_vtk: Error: incorrect point scalar data count; see "
+                    P4EST_STRING "_vtk.h for more details.");
+  }
+
+  /* keep variable all at current value */
+  vector_strlen = 0;
+  point_vectors[0] = '\0';
+  for (i = 0; i < num_point_vectors; ++all, ++i) {
+    name = names[all] = va_arg (ap, const char *);
+    retval = snprintf (point_vectors + vector_strlen, BUFSIZ - vector_strlen,
+                       "%s%s", i == 0 ? "" : ",", name);
+    SC_CHECK_ABORT (retval > 0 && retval < BUFSIZ - vector_strlen,
+                    P4EST_STRING "_vtk: Error collecting point vectors");
+    vector_strlen += retval;
+    values[all] = va_arg (ap, sc_array_t *);
+
+    /* Validate input. */
+    SC_CHECK_ABORT (values[all]->elem_size == sizeof (double),
+                    P4EST_STRING
+                    "_vtk: Error: incorrect point vector data type;"
+                    " vector data must contain doubles.");
+    SC_CHECK_ABORT (values[all]->elem_count == 3 * (size_t) cont->num_corners,
+                    P4EST_STRING
+                    "_vtk: Error: incorrect point vector data count; see "
+                    P4EST_STRING "_vtk.h for more details.");
+  }
+
+  /* Check for pointer variable marking the end of variable data input. */
+  list_end = va_arg (ap, p4est_vtk_context_t *);
+  SC_CHECK_ABORT (list_end == cont,
+                  P4EST_STRING "_vtk Error: the end of variable data must be"
+                  " specified by passing, as the last argument, the current "
+                  P4EST_STRING "_vtk_context_t pointer.  See " P4EST_STRING
+                  "_vtk.h for more information.");
+
+  fprintf (cont->vtufile, "      <PointData");
+  if (point_scalars != NULL)
+    fprintf (cont->vtufile, " Scalars=\"%s\"", point_scalars);
+  if (point_vectors != NULL)
+    fprintf (cont->vtufile, " Vectors=\"%s\"", point_vectors);
+  fprintf (cont->vtufile, ">\n");
+
+  if (ferror (cont->vtufile)) {
+    P4EST_LERRORF (P4EST_STRING "_vtk: Error writing %s\n",
+                   cont->vtufilename);
+    p4est_vtk_context_destroy (cont);
+
+    P4EST_FREE (values);
+    P4EST_FREE (names);
+
+    return NULL;
+  }
+
+  /* now we are done counting and checking we write the fields */
+  all = 0;
+  for (i = 0; i < num_point_scalars; ++all, ++i) {
+    cont = p4est_vtk_write_point_scalar (cont, names[all], values[all]);
+    SC_CHECK_ABORT (cont != NULL,
+                    P4EST_STRING "_vtk: Error writing point scalars");
+  }
+
+  for (i = 0; i < num_point_vectors; ++all, ++i) {
+    cont = p4est_vtk_write_point_vector (cont, names[all], values[all]);
+    SC_CHECK_ABORT (cont != NULL,
+                    P4EST_STRING "_vtk: Error writing point vectors");
+  }
+
+  fprintf (cont->vtufile, "      </PointData>\n");
+
+  P4EST_FREE (values);
+
+  if (ferror (cont->vtufile)) {
+    P4EST_LERRORF (P4EST_STRING "_vtk: Error writing %s\n",
+                   cont->vtufilename);
+    p4est_vtk_context_destroy (cont);
+
+    P4EST_FREE (names);
+
+    return NULL;
+  }
+
+  /* Only have the root write to the parallel vtk file */
+  if (mpirank == 0) {
+    fprintf (cont->pvtufile, "    <PPointData>\n");
+
+    all = 0;
+    for (i = 0; i < num_point_scalars; ++all, i++)
+      fprintf (cont->pvtufile, "      "
+               "<PDataArray type=\"%s\" Name=\"%s\" format=\"%s\"/>\n",
+               P4EST_VTK_FLOAT_NAME, names[all], P4EST_VTK_FORMAT_STRING);
+
+    for (i = 0; i < num_point_vectors; ++all, i++)
+      fprintf (cont->pvtufile, "      "
+               "<PDataArray type=\"%s\" Name=\"%s\" format=\"%s\"/>\n",
+               P4EST_VTK_FLOAT_NAME, names[all], P4EST_VTK_FORMAT_STRING);
+
+    fprintf (cont->pvtufile, "    </PPointData>\n");
+
+    if (ferror (cont->pvtufile)) {
+      P4EST_LERROR (P4EST_STRING "_vtk: Error writing parallel header\n");
+      p4est_vtk_context_destroy (cont);
+
+      P4EST_FREE (names);
+
+      return NULL;
+    }
+  }
+
+  P4EST_FREE (names);
+
+  return cont;
+}
 
-    fprintf (vtufile, "      <CellData Scalars=\"%s\">\n", vtkCellDataString);
+p4est_vtk_context_t *
+p4est_vtk_write_point_dataf (p4est_vtk_context_t * cont,
+                             int num_point_scalars, int num_point_vectors,
+                             ...)
+{
+  va_list             args;
+  p4est_vtk_context_t *result;
+  P4EST_ASSERT (cont != NULL && cont->writing);
+  P4EST_ASSERT (num_point_scalars >= 0 && num_point_vectors >= 0);
+  /* The variable arguments begin right after num_point_vectors. */
+  va_start (args, num_point_vectors);
+  result = p4est_vtk_write_point_datav (cont, num_point_scalars,
+                                        num_point_vectors, args);
+  va_end (args);
+  /* NULL on failure, otherwise the context that was handed in. */
+  return result;
+}
+
+/** Write VTK cell data.
+ *
+ * This function exports custom cell data to the vtk file; it is functionally
+ * the same as \b p4est_vtk_write_cell_dataf with the only difference being
+ * that instead of a variable argument list, an initialized \a va_list is
+ * passed as the last argument. The \a va_list is initialized from the variable
+ * argument list of the calling function.
+ *
+ * \note This function is actually called from \b p4est_vtk_write_cell_dataf
+ * and does all of the work.
+ *
+ * \param [in,out] cont    A vtk context created by \ref p4est_vtk_context_new.
+ * \param [in] num_cell_scalars  Number of cell scalar datasets to output.
+ * \param [in] num_cell_vectors  Number of cell vector datasets to output.
+ * \param [in,out] ap      An initialized va_list used to access the
+ *                         scalar/vector data.
+ *
+ * \return          On success, the context that has been passed in.
+ *                  On failure, returns NULL and deallocates the context.
+ */
+static p4est_vtk_context_t *
+p4est_vtk_write_cell_datav (p4est_vtk_context_t * cont,
+                            int write_tree, int write_level,
+                            int write_rank, int wrap_rank,
+                            int num_cell_scalars,
+                            int num_cell_vectors, va_list ap)
+{
+  /* This function needs to do nothing if there is no data. */
+  if (!
+      (write_tree || write_level || write_rank || wrap_rank
+       || num_cell_scalars || num_cell_vectors))
+    return cont;
+
+  const int           mpirank = cont->p4est->mpirank;
+  int                 retval;
+  int                 i, all = 0;
+  int                 scalar_strlen, vector_strlen;
+  sc_array_t         *trees = cont->p4est->trees;
+  p4est_tree_t       *tree;
+  const p4est_topidx_t first_local_tree = cont->p4est->first_local_tree;
+  const p4est_topidx_t last_local_tree = cont->p4est->last_local_tree;
+  const p4est_locidx_t Ncells = cont->p4est->local_num_quadrants;
+  char                cell_scalars[BUFSIZ], cell_vectors[BUFSIZ];
+  const char         *name, **names;
+  sc_array_t        **values;
+  size_t              num_quads, zz;
+  sc_array_t         *quadrants;
+  p4est_quadrant_t   *quad;
+#ifdef P4EST_VTK_ASCII
+  p4est_locidx_t      sk;
+#else
+  uint8_t            *uint8_data;
+  p4est_locidx_t     *locidx_data;
+#endif
+  p4est_topidx_t      jt;
+  p4est_locidx_t      il;
+
+  P4EST_ASSERT (cont != NULL && cont->writing);
+  P4EST_ASSERT (wrap_rank >= 0);
+
+  values = P4EST_ALLOC (sc_array_t *, num_cell_scalars + num_cell_vectors);
+  names = P4EST_ALLOC (const char *, num_cell_scalars + num_cell_vectors);
+
+  /* Gather cell data. */
+  scalar_strlen = 0;
+  cell_scalars[0] = '\0';
+  for (i = 0; i < num_cell_scalars; ++all, ++i) {
+    name = names[all] = va_arg (ap, const char *);
+    retval = snprintf (cell_scalars + scalar_strlen, BUFSIZ - scalar_strlen,
+                       "%s%s", i == 0 ? "" : ",", name);
+    SC_CHECK_ABORT (retval > 0,
+                    P4EST_STRING "_vtk: Error collecting cell scalars");
+    scalar_strlen += retval;
+    values[all] = va_arg (ap, sc_array_t *);
+
+    /* Validate input. */
+    SC_CHECK_ABORT (values[all]->elem_size == sizeof (double),
+                    P4EST_STRING
+                    "_vtk: Error: incorrect cell scalar data type; scalar data must contain doubles.");
+    SC_CHECK_ABORT (values[all]->elem_count ==
+                    (size_t) cont->p4est->local_num_quadrants,
+                    P4EST_STRING
+                    "_vtk: Error: incorrect cell scalar data count; scalar data must contain exactly p4est->local_num_quadrants doubles.");
+  }
+
+  vector_strlen = 0;
+  cell_vectors[0] = '\0';
+  for (i = 0; i < num_cell_vectors; ++all, ++i) {
+    name = names[all] = va_arg (ap, const char *);
+    retval = snprintf (cell_vectors + vector_strlen, BUFSIZ - vector_strlen,
+                       "%s%s", i == 0 ? "" : ",", name);
+    SC_CHECK_ABORT (retval > 0,
+                    P4EST_STRING "_vtk: Error collecting cell vectors");
+    vector_strlen += retval;
+    values[all] = va_arg (ap, sc_array_t *);
+
+    /* Validate input. */
+    SC_CHECK_ABORT (values[all]->elem_size == sizeof (double),
+                    P4EST_STRING
+                    "_vtk: Error: incorrect cell vector data type; vector data must contain doubles.");
+    SC_CHECK_ABORT (values[all]->elem_count ==
+                    3 * (size_t) cont->p4est->local_num_quadrants,
+                    P4EST_STRING
+                    "_vtk: Error: incorrect cell vector data count; vector data must contain exactly 3*p4est->local_num_quadrants doubles.");
   }
 
+  /* Check for pointer variable marking the end of variable data input. */
+  p4est_vtk_context_t *end = va_arg (ap, p4est_vtk_context_t *);
+  SC_CHECK_ABORT (end == cont, P4EST_STRING "_vtk Error: the end of variable "
+                  "data must be specified by passing, as the last argument, the current "
+                  P4EST_STRING "_vtk_context_t struct. See " P4EST_STRING
+                  "_vtk.h for more information.");
+
+  char                vtkCellDataString[BUFSIZ] = "";
+  int                 printed = 0;
+
+  if (write_tree)
+    printed +=
+      snprintf (vtkCellDataString + printed, BUFSIZ - printed, "treeid");
+
+  if (write_level)
+    printed +=
+      snprintf (vtkCellDataString + printed, BUFSIZ - printed,
+                printed > 0 ? ",level" : "level");
+
+  if (write_rank)
+    printed +=
+      snprintf (vtkCellDataString + printed, BUFSIZ - printed,
+                printed > 0 ? ",mpirank" : "mpirank");
+
+  if (num_cell_scalars)
+    printed +=
+      snprintf (vtkCellDataString + printed, BUFSIZ - printed,
+                printed > 0 ? ",%s" : "%s", cell_scalars);
+
+  if (num_cell_vectors)
+    printed +=
+      snprintf (vtkCellDataString + printed, BUFSIZ - printed,
+                printed > 0 ? ",%s" : "%s", cell_vectors);
+
+  fprintf (cont->vtufile, "      <CellData Scalars=\"%s\">\n",
+           vtkCellDataString);
+
+#ifndef P4EST_VTK_ASCII
+  locidx_data = P4EST_ALLOC (p4est_locidx_t, Ncells);
+  uint8_data = P4EST_ALLOC (uint8_t, Ncells);
+#endif
+
   if (write_tree) {
-    fprintf (vtufile, "        <DataArray type=\"%s\" Name=\"treeid\""
+    fprintf (cont->vtufile, "        <DataArray type=\"%s\" Name=\"treeid\""
              " format=\"%s\">\n", P4EST_VTK_LOCIDX, P4EST_VTK_FORMAT_STRING);
 #ifdef P4EST_VTK_ASCII
-    fprintf (vtufile, "         ");
+    fprintf (cont->vtufile, "         ");
     for (il = 0, sk = 1, jt = first_local_tree; jt <= last_local_tree; ++jt) {
       tree = p4est_tree_array_index (trees, jt);
       num_quads = tree->quadrants.elem_count;
       for (zz = 0; zz < num_quads; ++zz, ++sk, ++il) {
-        fprintf (vtufile, " %lld", (long long) jt);
+        fprintf (cont->vtufile, " %lld", (long long) jt);
         if (!(sk % 20) && il != (Ncells - 1))
-          fprintf (vtufile, "\n         ");
+          fprintf (cont->vtufile, "\n         ");
       }
     }
-    fprintf (vtufile, "\n");
+    fprintf (cont->vtufile, "\n");
 #else
     for (il = 0, jt = first_local_tree; jt <= last_local_tree; ++jt) {
       tree = p4est_tree_array_index (trees, jt);
@@ -533,37 +1181,43 @@ p4est_vtk_write_header (p4est_t * p4est, p4est_geometry_t * geom,
         locidx_data[il] = (p4est_locidx_t) jt;
       }
     }
-    fprintf (vtufile, "          ");
-    retval = p4est_vtk_write_binary (vtufile, (char *) locidx_data,
+    fprintf (cont->vtufile, "          ");
+    retval = p4est_vtk_write_binary (cont->vtufile, (char *) locidx_data,
                                      sizeof (*locidx_data) * Ncells);
-    fprintf (vtufile, "\n");
+    fprintf (cont->vtufile, "\n");
     if (retval) {
       P4EST_LERROR (P4EST_STRING "_vtk: Error encoding types\n");
-      fclose (vtufile);
-      return -1;
+      p4est_vtk_context_destroy (cont);
+
+      P4EST_FREE (values);
+      P4EST_FREE (names);
+      P4EST_FREE (locidx_data);
+      P4EST_FREE (uint8_data);
+
+      return NULL;
     }
 #endif
-    fprintf (vtufile, "        </DataArray>\n");
+    fprintf (cont->vtufile, "        </DataArray>\n");
     P4EST_ASSERT (il == Ncells);
   }
 
   if (write_level) {
-    fprintf (vtufile, "        <DataArray type=\"%s\" Name=\"level\""
+    fprintf (cont->vtufile, "        <DataArray type=\"%s\" Name=\"level\""
              " format=\"%s\">\n", "UInt8", P4EST_VTK_FORMAT_STRING);
 #ifdef P4EST_VTK_ASCII
-    fprintf (vtufile, "         ");
+    fprintf (cont->vtufile, "         ");
     for (il = 0, sk = 1, jt = first_local_tree; jt <= last_local_tree; ++jt) {
       tree = p4est_tree_array_index (trees, jt);
       quadrants = &tree->quadrants;
       num_quads = quadrants->elem_count;
       for (zz = 0; zz < num_quads; ++zz, ++sk, ++il) {
         quad = p4est_quadrant_array_index (quadrants, zz);
-        fprintf (vtufile, " %d", (int) quad->level);
+        fprintf (cont->vtufile, " %d", (int) quad->level);
         if (!(sk % 20) && il != (Ncells - 1))
-          fprintf (vtufile, "\n         ");
+          fprintf (cont->vtufile, "\n         ");
       }
     }
-    fprintf (vtufile, "\n");
+    fprintf (cont->vtufile, "\n");
 #else
     for (il = 0, jt = first_local_tree; jt <= last_local_tree; ++jt) {
       tree = p4est_tree_array_index (trees, jt);
@@ -575,375 +1229,390 @@ p4est_vtk_write_header (p4est_t * p4est, p4est_geometry_t * geom,
       }
     }
 
-    fprintf (vtufile, "          ");
-    retval = p4est_vtk_write_binary (vtufile, (char *) uint8_data,
+    fprintf (cont->vtufile, "          ");
+    retval = p4est_vtk_write_binary (cont->vtufile, (char *) uint8_data,
                                      sizeof (*uint8_data) * Ncells);
-    fprintf (vtufile, "\n");
+    fprintf (cont->vtufile, "\n");
+
+    P4EST_FREE (uint8_data);
+
     if (retval) {
       P4EST_LERROR (P4EST_STRING "_vtk: Error encoding types\n");
-      fclose (vtufile);
-      return -1;
+      p4est_vtk_context_destroy (cont);
+
+      P4EST_FREE (values);
+      P4EST_FREE (names);
+      P4EST_FREE (locidx_data);
+
+      return NULL;
     }
 #endif
-    fprintf (vtufile, "        </DataArray>\n");
+    fprintf (cont->vtufile, "        </DataArray>\n");
   }
 
   if (write_rank) {
     const int           wrapped_rank =
       wrap_rank > 0 ? mpirank % wrap_rank : mpirank;
 
-    fprintf (vtufile, "        <DataArray type=\"%s\" Name=\"mpirank\""
+    fprintf (cont->vtufile, "        <DataArray type=\"%s\" Name=\"mpirank\""
              " format=\"%s\">\n", P4EST_VTK_LOCIDX, P4EST_VTK_FORMAT_STRING);
 #ifdef P4EST_VTK_ASCII
-    fprintf (vtufile, "         ");
+    fprintf (cont->vtufile, "         ");
     for (il = 0, sk = 1; il < Ncells; ++il, ++sk) {
-      fprintf (vtufile, " %d", wrapped_rank);
+      fprintf (cont->vtufile, " %d", wrapped_rank);
       if (!(sk % 20) && il != (Ncells - 1))
-        fprintf (vtufile, "\n         ");
+        fprintf (cont->vtufile, "\n         ");
     }
-    fprintf (vtufile, "\n");
+    fprintf (cont->vtufile, "\n");
 #else
     for (il = 0; il < Ncells; ++il)
       locidx_data[il] = (p4est_locidx_t) wrapped_rank;
 
-    fprintf (vtufile, "          ");
-    retval = p4est_vtk_write_binary (vtufile, (char *) locidx_data,
+    fprintf (cont->vtufile, "          ");
+    retval = p4est_vtk_write_binary (cont->vtufile, (char *) locidx_data,
                                      sizeof (*locidx_data) * Ncells);
-    fprintf (vtufile, "\n");
+    fprintf (cont->vtufile, "\n");
+
+    P4EST_FREE (locidx_data);
+
     if (retval) {
       P4EST_LERROR (P4EST_STRING "_vtk: Error encoding types\n");
-      fclose (vtufile);
-      return -1;
+      p4est_vtk_context_destroy (cont);
+
+      P4EST_FREE (values);
+      P4EST_FREE (names);
+
+      return NULL;
     }
 #endif
-    fprintf (vtufile, "        </DataArray>\n");
+    fprintf (cont->vtufile, "        </DataArray>\n");
   }
 
-  if (write_tree || write_level || write_rank) {
-    fprintf (vtufile, "      </CellData>\n");
-  }
+  if (ferror (cont->vtufile)) {
+    P4EST_LERRORF (P4EST_STRING "_vtk: Error writing %s\n",
+                   cont->vtufilename);
+    p4est_vtk_context_destroy (cont);
 
-#ifndef P4EST_VTK_ASCII
-  P4EST_FREE (locidx_data);
-  P4EST_FREE (uint8_data);
-#endif
+    P4EST_FREE (values);
+    P4EST_FREE (names);
 
-  if (nodes != NULL) {
-    p4est_nodes_destroy (nodes);
+    return NULL;
   }
 
-  fprintf (vtufile, "      <PointData");
-  if (point_scalars != NULL)
-    fprintf (vtufile, " Scalars=\"%s\"", point_scalars);
-  if (point_vectors != NULL)
-    fprintf (vtufile, " Vectors=\"%s\"", point_vectors);
-  fprintf (vtufile, ">\n");
-
-  if (ferror (vtufile)) {
-    P4EST_LERROR (P4EST_STRING "_vtk: Error writing header\n");
-    fclose (vtufile);
-    return -1;
+  all = 0;
+  for (i = 0; i < num_cell_scalars; ++all, ++i) {
+    cont = p4est_vtk_write_cell_scalar (cont, names[all], values[all]);
+    SC_CHECK_ABORT (cont != NULL,
+                    P4EST_STRING "_vtk: Error writing cell scalars");
   }
-  if (fclose (vtufile)) {
-    P4EST_LERROR (P4EST_STRING "_vtk: Error closing header\n");
-    return -1;
+
+  for (i = 0; i < num_cell_vectors; ++all, ++i) {
+    cont = p4est_vtk_write_cell_vector (cont, names[all], values[all]);
+    SC_CHECK_ABORT (cont != NULL,
+                    P4EST_STRING "_vtk: Error writing cell vectors");
   }
-  vtufile = NULL;
 
-  /* Only have the root write to the parallel vtk file */
-  if (mpirank == 0) {
-    char                pvtufilename[BUFSIZ];
-    FILE               *pvtufile;
+  fprintf (cont->vtufile, "      </CellData>\n");
 
-    snprintf (pvtufilename, BUFSIZ, "%s.pvtu", filename);
+  P4EST_FREE (values);
 
-    pvtufile = fopen (pvtufilename, "wb");
-    if (!pvtufile) {
-      P4EST_LERRORF ("Could not open %s for output\n", vtufilename);
-      return -1;
-    }
+  if (ferror (cont->vtufile)) {
+    P4EST_LERRORF (P4EST_STRING "_vtk: Error writing %s\n",
+                   cont->vtufilename);
+    p4est_vtk_context_destroy (cont);
 
-    fprintf (pvtufile, "<?xml version=\"1.0\"?>\n");
-    fprintf (pvtufile, "<VTKFile type=\"PUnstructuredGrid\" version=\"0.1\"");
-#if defined P4EST_VTK_BINARY && defined P4EST_VTK_COMPRESSION
-    fprintf (pvtufile, " compressor=\"vtkZLibDataCompressor\"");
-#endif
-#ifdef SC_IS_BIGENDIAN
-    fprintf (pvtufile, " byte_order=\"BigEndian\">\n");
-#else
-    fprintf (pvtufile, " byte_order=\"LittleEndian\">\n");
-#endif
+    P4EST_FREE (names);
 
-    fprintf (pvtufile, "  <PUnstructuredGrid GhostLevel=\"0\">\n");
-    fprintf (pvtufile, "    <PPoints>\n");
-    fprintf (pvtufile, "      <PDataArray type=\"%s\" Name=\"Position\""
-             " NumberOfComponents=\"3\" format=\"%s\"/>\n",
-             P4EST_VTK_FLOAT_NAME, P4EST_VTK_FORMAT_STRING);
-    fprintf (pvtufile, "    </PPoints>\n");
-    if (write_tree || write_level || write_rank) {
-      char                vtkCellDataString[BUFSIZ] = "";
-      int                 printed = 0;
-
-      if (write_tree)
-        printed +=
-          snprintf (vtkCellDataString + printed, BUFSIZ - printed, "treeid");
-      if (write_level)
-        printed +=
-          snprintf (vtkCellDataString + printed, BUFSIZ - printed,
-                    printed > 0 ? ",level" : "level");
-      if (write_rank)
-        printed +=
-          snprintf (vtkCellDataString + printed, BUFSIZ - printed,
-                    printed > 0 ? ",mpirank" : "mpirank");
-
-      fprintf (pvtufile, "    <PCellData Scalars=\"%s\">\n",
-               vtkCellDataString);
-    }
-    if (write_tree) {
-      fprintf (pvtufile, "      "
+    return NULL;
+  }
+
+  /* Only have the root write to the parallel vtk file */
+  if (mpirank == 0) {
+    fprintf (cont->pvtufile, "    <PCellData Scalars=\"%s\">\n",
+             vtkCellDataString);
+
+    if (write_tree)
+      fprintf (cont->pvtufile, "      "
                "<PDataArray type=\"%s\" Name=\"treeid\" format=\"%s\"/>\n",
                P4EST_VTK_LOCIDX, P4EST_VTK_FORMAT_STRING);
-    }
-    if (write_level) {
-      fprintf (pvtufile, "      "
+
+    if (write_level)
+      fprintf (cont->pvtufile, "      "
                "<PDataArray type=\"%s\" Name=\"level\" format=\"%s\"/>\n",
                "UInt8", P4EST_VTK_FORMAT_STRING);
-    }
-    if (write_rank) {
-      fprintf (pvtufile, "      "
+
+    if (write_rank)
+      fprintf (cont->pvtufile, "      "
                "<PDataArray type=\"%s\" Name=\"mpirank\" format=\"%s\"/>\n",
                P4EST_VTK_LOCIDX, P4EST_VTK_FORMAT_STRING);
-    }
-    if (write_tree || write_level || write_rank) {
-      fprintf (pvtufile, "    </PCellData>\n");
-    }
-    fprintf (pvtufile, "    <PPointData>\n");
 
-    if (ferror (pvtufile)) {
+    all = 0;
+    for (i = 0; i < num_cell_scalars; ++all, i++)
+      fprintf (cont->pvtufile, "      "
+               "<PDataArray type=\"%s\" Name=\"%s\" format=\"%s\"/>\n",
+               P4EST_VTK_FLOAT_NAME, names[all], P4EST_VTK_FORMAT_STRING);
+
+    for (i = 0; i < num_cell_vectors; ++all, i++)
+      fprintf (cont->pvtufile, "      "
+               "<PDataArray type=\"%s\" Name=\"%s\" format=\"%s\"/>\n",
+               P4EST_VTK_FLOAT_NAME, names[all], P4EST_VTK_FORMAT_STRING);
+
+    fprintf (cont->pvtufile, "    </PCellData>\n");
+
+    if (ferror (cont->pvtufile)) {
       P4EST_LERROR (P4EST_STRING "_vtk: Error writing parallel header\n");
-      fclose (pvtufile);
-      return -1;
-    }
-    if (fclose (pvtufile)) {
-      P4EST_LERROR (P4EST_STRING "_vtk: Error closing parallel header\n");
-      return -1;
+      p4est_vtk_context_destroy (cont);
+
+      P4EST_FREE (names);
+
+      return NULL;
     }
   }
 
-  return 0;
+  P4EST_FREE (names);
+
+  return cont;
 }
 
-int
-p4est_vtk_write_point_scalar (p4est_t * p4est, p4est_geometry_t * geom,
-                              const char *filename,
-                              const char *scalar_name, const double *values)
+p4est_vtk_context_t *
+p4est_vtk_write_cell_dataf (p4est_vtk_context_t * cont,
+                            int write_tree, int write_level,
+                            int write_rank, int wrap_rank,
+                            int num_cell_scalars, int num_cell_vectors, ...)
 {
-  const int           mpirank = p4est->mpirank;
-  const p4est_locidx_t Ncells = p4est->local_num_quadrants;
-  const p4est_locidx_t Ncorners = P4EST_CHILDREN * Ncells;      /* type ok */
-  int                 retval;
-  p4est_locidx_t      il;
+  va_list             ap;
+
+  P4EST_ASSERT (cont != NULL && cont->writing);
+  P4EST_ASSERT (num_cell_scalars >= 0 && num_cell_vectors >= 0);
+
+  va_start (ap, num_cell_vectors);
+  cont = p4est_vtk_write_cell_datav (cont,
+                                     write_tree, write_level,
+                                     write_rank, wrap_rank,
+                                     num_cell_scalars, num_cell_vectors, ap);
+  va_end (ap);
+
+  return cont;
+}
+
+p4est_vtk_context_t *
+p4est_vtk_write_point_scalar (p4est_vtk_context_t * cont,
+                              const char *scalar_name, sc_array_t * values)
+{
+  p4est_locidx_t      il, ddl;
+  int                 use_nodes;
+#ifdef P4EST_ENABLE_DEBUG
+  int                 Ncorners;
+#endif
+  int                 Npoints;
 #ifndef P4EST_VTK_ASCII
+  int                 retval;
   P4EST_VTK_FLOAT_TYPE *float_data;
 #endif
-  char                vtufilename[BUFSIZ];
-  FILE               *vtufile;
+  p4est_locidx_t     *ntc;
 
-  /* Have each proc write to its own file */
-  snprintf (vtufilename, BUFSIZ, "%s_%04d.vtu", filename, mpirank);
-  /* To be able to fseek in a file you cannot open in append mode.
-   * so you need to open with "r+" and fseek to SEEK_END.
-   */
-  vtufile = fopen (vtufilename, "rb+");
-  if (vtufile == NULL) {
-    P4EST_LERRORF ("Could not open %s for output\n", vtufilename);
-    return -1;
+  P4EST_ASSERT (cont != NULL && cont->writing);
+#ifdef P4EST_ENABLE_DEBUG
+  Ncorners = cont->num_corners;
+#endif
+  Npoints = cont->num_points;
+  ntc = cont->node_to_corner;
+  P4EST_ASSERT (values != NULL && values->elem_count == (size_t) Ncorners);
+  if (ntc == NULL) {
+    /* we are writing a discontinuous field, possibly due to vertex scaling */
+    P4EST_ASSERT (cont->num_corners == cont->num_points);
+    P4EST_ASSERT (cont->scale < 1. || !cont->continuous);
+    use_nodes = 0;
   }
-  retval = fseek (vtufile, 0L, SEEK_END);
-  if (retval) {
-    P4EST_LERRORF ("Could not fseek %s for output\n", vtufilename);
-    fclose (vtufile);
-    return -1;
+  else {
+    /* we are definitely writing a continuous field, reusing corner values */
+    P4EST_ASSERT (cont->scale == 1. && cont->continuous);
+    use_nodes = 1;
   }
 
-  /* write point position data */
-  fprintf (vtufile, "        <DataArray type=\"%s\" Name=\"%s\""
+  /* write point data */
+  fprintf (cont->vtufile, "        <DataArray type=\"%s\" Name=\"%s\""
            " format=\"%s\">\n",
            P4EST_VTK_FLOAT_NAME, scalar_name, P4EST_VTK_FORMAT_STRING);
 
 #ifdef P4EST_VTK_ASCII
-  for (il = 0; il < Ncorners; ++il) {
+  for (il = 0; il < Npoints; ++il) {
+    ddl = use_nodes ? ntc[il] : il;
+    P4EST_ASSERT (0 <= ddl && ddl < Ncorners);
+
+    fprintf (cont->vtufile,
 #ifdef P4EST_VTK_DOUBLES
-    fprintf (vtufile, "     %24.16e\n", values[il]);
+             "     %24.16e\n",
 #else
-    fprintf (vtufile, "          %16.8e\n", values[il]);
+             "          %16.8e\n",
 #endif
+             *(double *) sc_array_index (values, ddl));
   }
 #else
-  float_data = P4EST_ALLOC (P4EST_VTK_FLOAT_TYPE, Ncorners);
-  for (il = 0; il < Ncorners; ++il) {
-    float_data[il] = (P4EST_VTK_FLOAT_TYPE) values[il];
+  float_data = P4EST_ALLOC (P4EST_VTK_FLOAT_TYPE, Npoints);
+  for (il = 0; il < Npoints; ++il) {
+    ddl = use_nodes ? ntc[il] : il;
+    P4EST_ASSERT (0 <= ddl && ddl < Ncorners);
+    float_data[il] =
+      (P4EST_VTK_FLOAT_TYPE) * ((double *) sc_array_index (values, ddl));
   }
 
-  fprintf (vtufile, "          ");
+  fprintf (cont->vtufile, "          ");
   /* TODO: Don't allocate the full size of the array, only allocate
    * the chunk that will be passed to zlib and do this a chunk
    * at a time.
    */
-  retval = p4est_vtk_write_binary (vtufile, (char *) float_data,
-                                   sizeof (*float_data) * Ncorners);
-  fprintf (vtufile, "\n");
+  retval = p4est_vtk_write_binary (cont->vtufile, (char *) float_data,
+                                   sizeof (*float_data) * Npoints);
+  fprintf (cont->vtufile, "\n");
+
+  P4EST_FREE (float_data);
+
   if (retval) {
     P4EST_LERROR (P4EST_STRING "_vtk: Error encoding points\n");
-    fclose (vtufile);
-    return -1;
+    p4est_vtk_context_destroy (cont);
+    return NULL;
   }
-  P4EST_FREE (float_data);
 #endif
-  fprintf (vtufile, "        </DataArray>\n");
+  fprintf (cont->vtufile, "        </DataArray>\n");
 
-  if (ferror (vtufile)) {
+  if (ferror (cont->vtufile)) {
     P4EST_LERROR (P4EST_STRING "_vtk: Error writing point scalar\n");
-    fclose (vtufile);
-    return -1;
+    p4est_vtk_context_destroy (cont);
+    return NULL;
   }
-  if (fclose (vtufile)) {
-    P4EST_LERROR (P4EST_STRING "_vtk: Error closing point scalar\n");
-    return -1;
+
+  return cont;
+}
+
+p4est_vtk_context_t *
+p4est_vtk_write_point_vector (p4est_vtk_context_t * cont,
+                              const char *vector_name, sc_array_t * values)
+{
+  P4EST_ASSERT (cont != NULL && cont->writing);
+
+  SC_ABORT (P4EST_STRING "_vtk_write_point_vector not implemented");
+}
+
+p4est_vtk_context_t *
+p4est_vtk_write_cell_scalar (p4est_vtk_context_t * cont,
+                             const char *scalar_name, sc_array_t * values)
+{
+  const p4est_locidx_t Ncells = cont->p4est->local_num_quadrants;
+  p4est_locidx_t      il;
+#ifndef P4EST_VTK_ASCII
+  int                 retval;
+  P4EST_VTK_FLOAT_TYPE *float_data;
+#endif
+
+  P4EST_ASSERT (cont != NULL && cont->writing);
+
+  /* Write cell data. */
+  fprintf (cont->vtufile, "        <DataArray type=\"%s\" Name=\"%s\""
+           " format=\"%s\">\n",
+           P4EST_VTK_FLOAT_NAME, scalar_name, P4EST_VTK_FORMAT_STRING);
+
+#ifdef P4EST_VTK_ASCII
+  for (il = 0; il < Ncells; ++il) {
+    fprintf (cont->vtufile,
+#ifdef P4EST_VTK_DOUBLES
+             "     %24.16e\n",
+#else
+             "          %16.8e\n",
+#endif
+             *(double *) sc_array_index (values, il));
+  }
+#else
+  float_data = P4EST_ALLOC (P4EST_VTK_FLOAT_TYPE, Ncells);
+  for (il = 0; il < Ncells; ++il) {
+    float_data[il] =
+      (P4EST_VTK_FLOAT_TYPE) * ((double *) sc_array_index (values, il));
   }
-  vtufile = NULL;
 
-  /* Only have the root write to the parallel vtk file */
-  if (mpirank == 0) {
-    char                pvtufilename[BUFSIZ];
-    FILE               *pvtufile;
-    snprintf (pvtufilename, BUFSIZ, "%s.pvtu", filename);
+  fprintf (cont->vtufile, "          ");
+  /* TODO: Don't allocate the full size of the array, only allocate
+   * the chunk that will be passed to zlib and do this a chunk
+   * at a time.
+   */
+  retval = p4est_vtk_write_binary (cont->vtufile, (char *) float_data,
+                                   sizeof (*float_data) * Ncells);
+  fprintf (cont->vtufile, "\n");
 
-    pvtufile = fopen (pvtufilename, "ab");
-    if (!pvtufile) {
-      P4EST_LERRORF ("Could not open %s for output\n", vtufilename);
-      return -1;
-    }
+  P4EST_FREE (float_data);
 
-    fprintf (pvtufile, "      <PDataArray type=\"%s\" Name=\"%s\""
-             " format=\"%s\"/>\n",
-             P4EST_VTK_FLOAT_NAME, scalar_name, P4EST_VTK_FORMAT_STRING);
+  if (retval) {
+    P4EST_LERROR (P4EST_STRING "_vtk: Error encoding scalar cell data\n");
+    p4est_vtk_context_destroy (cont);
+    return NULL;
+  }
+#endif
+  fprintf (cont->vtufile, "        </DataArray>\n");
 
-    if (ferror (pvtufile)) {
-      P4EST_LERROR (P4EST_STRING
-                    "_vtk: Error writing parallel point scalar\n");
-      fclose (pvtufile);
-      return -1;
-    }
-    if (fclose (pvtufile)) {
-      P4EST_LERROR (P4EST_STRING
-                    "_vtk: Error closing parallel point scalar\n");
-      return -1;
-    }
+  if (ferror (cont->vtufile)) {
+    P4EST_LERROR (P4EST_STRING "_vtk: Error writing cell scalar file\n");
+    p4est_vtk_context_destroy (cont);
+    return NULL;
   }
 
-  return 0;
+  return cont;
 }
 
-int
-p4est_vtk_write_point_vector (p4est_t * p4est, p4est_geometry_t * geom,
-                              const char *filename,
-                              const char *vector_name, const double *values)
+p4est_vtk_context_t *
+p4est_vtk_write_cell_vector (p4est_vtk_context_t * cont,
+                             const char *vector_name, sc_array_t * values)
 {
-  SC_ABORT (P4EST_STRING "_vtk_write_point_vector not implemented");
+  P4EST_ASSERT (cont != NULL && cont->writing);
+
+  SC_ABORT (P4EST_STRING "_vtk_write_cell_vector not implemented");
 }
 
 int
-p4est_vtk_write_footer (p4est_t * p4est, const char *filename)
+p4est_vtk_write_footer (p4est_vtk_context_t * cont)
 {
-  char                vtufilename[BUFSIZ];
   int                 p;
-  int                 procRank = p4est->mpirank;
-  int                 numProcs = p4est->mpisize;
-  FILE               *vtufile;
+  int                 procRank = cont->p4est->mpirank;
+  int                 numProcs = cont->p4est->mpisize;
 
-  /* Have each proc write to its own file */
-  snprintf (vtufilename, BUFSIZ, "%s_%04d.vtu", filename, procRank);
-  vtufile = fopen (vtufilename, "ab");
-  if (vtufile == NULL) {
-    P4EST_LERRORF ("Could not open %s for output!\n", vtufilename);
-    return -1;
-  }
+  P4EST_ASSERT (cont != NULL && cont->writing);
 
-  fprintf (vtufile, "      </PointData>\n");
-  fprintf (vtufile, "    </Piece>\n");
-  fprintf (vtufile, "  </UnstructuredGrid>\n");
-  fprintf (vtufile, "</VTKFile>\n");
+  fprintf (cont->vtufile, "    </Piece>\n");
+  fprintf (cont->vtufile, "  </UnstructuredGrid>\n");
+  fprintf (cont->vtufile, "</VTKFile>\n");
 
-  if (ferror (vtufile)) {
-    P4EST_LERROR ("p4est_vtk: Error writing footer\n");
-    fclose (vtufile);
+  if (ferror (cont->vtufile)) {
+    P4EST_LERROR (P4EST_STRING "_vtk: Error writing footer\n");
+    p4est_vtk_context_destroy (cont);
     return -1;
   }
-  if (fclose (vtufile)) {
-    P4EST_LERROR ("p4est_vtk: Error closing footer\n");
-    return -1;
-  }
-  vtufile = NULL;
 
   /* Only have the root write to the parallel vtk file */
   if (procRank == 0) {
-    char                visitfilename[BUFSIZ];
-    char                pvtufilename[BUFSIZ];
-    FILE               *pvtufile, *visitfile;
-
-    /* Reopen paraview master file for writing bottom half */
-    snprintf (pvtufilename, BUFSIZ, "%s.pvtu", filename);
-    pvtufile = fopen (pvtufilename, "ab");
-    if (!pvtufile) {
-      P4EST_LERRORF ("Could not open %s for output!\n", vtufilename);
-      return -1;
-    }
-
-    /* Create a master file for visualization in Visit */
-    snprintf (visitfilename, BUFSIZ, "%s.visit", filename);
-    visitfile = fopen (visitfilename, "wb");
-    if (!visitfile) {
-      P4EST_LERRORF ("Could not open %s for output\n", visitfilename);
-      fclose (pvtufile);
-      return -1;
-    }
-    fprintf (visitfile, "!NBLOCKS %d\n", numProcs);
+    fprintf (cont->visitfile, "!NBLOCKS %d\n", numProcs);
 
     /* Write data about the parallel pieces into both files */
-    fprintf (pvtufile, "    </PPointData>\n");
     for (p = 0; p < numProcs; ++p) {
-      fprintf (pvtufile,
-               "    <Piece Source=\"%s_%04d.vtu\"/>\n", filename, p);
-      fprintf (visitfile, "%s_%04d.vtu\n", filename, p);
-    }
-    fprintf (pvtufile, "  </PUnstructuredGrid>\n");
-    fprintf (pvtufile, "</VTKFile>\n");
-
-    /* Close paraview master file */
-    if (ferror (pvtufile)) {
-      P4EST_LERROR ("p4est_vtk: Error writing parallel footer\n");
-      fclose (visitfile);
-      fclose (pvtufile);
-      return -1;
-    }
-    if (fclose (pvtufile)) {
-      fclose (visitfile);
-      P4EST_LERROR ("p4est_vtk: Error closing parallel footer\n");
-      return -1;
+      fprintf (cont->pvtufile,
+               "    <Piece Source=\"%s_%04d.vtu\"/>\n", cont->filename, p);
+      fprintf (cont->visitfile, "%s_%04d.vtu\n", cont->filename, p);
     }
+    fprintf (cont->pvtufile, "  </PUnstructuredGrid>\n");
+    fprintf (cont->pvtufile, "</VTKFile>\n");
 
-    /* Close visit master file */
-    if (ferror (visitfile)) {
-      P4EST_LERROR ("p4est_vtk: Error writing parallel footer\n");
-      fclose (visitfile);
+    if (ferror (cont->pvtufile)) {
+      P4EST_LERROR (P4EST_STRING "_vtk: Error writing parallel footer\n");
+      p4est_vtk_context_destroy (cont);
       return -1;
     }
-    if (fclose (visitfile)) {
-      P4EST_LERROR ("p4est_vtk: Error closing parallel footer\n");
+
+    if (ferror (cont->visitfile)) {
+      P4EST_LERROR (P4EST_STRING "_vtk: Error writing parallel footer\n");
+      p4est_vtk_context_destroy (cont);
       return -1;
     }
   }
 
+  /* Destroy context structure. */
+  p4est_vtk_context_destroy (cont);
+
   return 0;
 }
diff --git a/src/p4est_vtk.h b/src/p4est_vtk.h
index f86a72d..8e554ff 100644
--- a/src/p4est_vtk.h
+++ b/src/p4est_vtk.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -23,7 +24,7 @@
 
 /** \file p4est_vtk.h
  *
- * Routines for printing a forest and associated fields to vtk format.
+ * Routines for printing a forest and associated fields to VTK format.
  *
  * \ingroup p4est
  */
@@ -36,159 +37,231 @@
 
 SC_EXTERN_C_BEGIN;
 
-/** This writes out the p4est in VTK format.
+/** Opaque context type for writing VTK output with multiple function calls.
+ */
+typedef struct p4est_vtk_context p4est_vtk_context_t;
+
+/** Write the p4est in VTK format.
  *
- * This is a convenience function for the special
- * case of writing out the tree id, quadrant level, and MPI rank only.
+ * This is a convenience function for the special case of writing out
+ * the tree id, quadrant level, and MPI rank only.
  * One file is written per MPI rank, and one meta file on rank 0.
+ * The quadrants are scaled to length .95; see \ref p4est_vtk_write_header.
  * This function will abort if there is a file error.
  *
  * \param [in] p4est    The p4est to be written.
- * \param [in] geom     A p4est_geometry_t structure or NULL for vertex space.
+ * \param [in] geom     A p4est_geometry_t structure or NULL for vertex space
+ *                      as defined by p4est->connectivity.
  * \param [in] filename The first part of the file name which will have the
  *                      MPI rank appended to it: The output file will be
- *                      filename_rank.vtu, and the meta file filename.pvtu).
+ *                      filename_rank.vtu, and the meta file filename.pvtu.
  */
 void                p4est_vtk_write_file (p4est_t * p4est,
                                           p4est_geometry_t * geom,
                                           const char *filename);
 
-/** This writes out the p4est and any number of point fields in VTK format.
+/** The first call to write a VTK file using individual functions.
  *
- * This is a convenience function that will abort if there is a file error.
+ * Writing a VTK file is split into multiple functions that keep a context.
+ * This is the first function that allocates the opaque context structure.
+ * After allocation, further parameters can be set for the context.
+ * Then, the header, possible data fields, and the footer must be written.
+ * The process can be aborted any time by destroying the context.  In this
+ * case, open files are closed cleanly with only partially written content.
  *
- * \param [in] p4est    The p4est to be written.
- * \param [in] geom     A p4est_geometry_t structure or NULL for vertex space.
- * \param [in] scale    Double value between 0 and 1 to scale each quadrant.
- * \param [in] write_tree   Include the tree id as output field.
- * \param [in] write_level  Include the tree levels as output field.
- * \param [in] write_rank   Include the MPI rank as output field.
- * \param [in] wrap_tree    The MPI rank is written module wrap_tree, or 0.
- * \param filename      First part of the name, see p4est_vtk_write_file.
- * \param num_scalars   Number of scalar fields to write.
- * \param num_vectors   Number of vector fields to write.
- *
- * The variable arguments need to be pairs of (fieldname, fieldvalues)
- * where the scalars come first, then the vectors.
+ * \param p4est     The p4est to be written.
+ *                  If no geometry is specified in
+ *                  \ref p4est_vtk_context_set_geom, we require
+ *                  \b p4est->connectivity to have valid vertex arrays.
+ * \param filename  The first part of the name which will have the processor
+ *                  number appended to it (i.e., the output file will be
+ *                  filename_rank.vtu).  The parallel meta-files for Paraview
+ *                  and Visit use this basename too.
+ *                  We copy this filename to internal storage, so it is not
+ *                  needed to remain alive after calling this function.
+ * \return          A VTK context for further use.
+ */
+p4est_vtk_context_t *p4est_vtk_context_new (p4est_t * p4est,
+                                            const char *filename);
+
+/** Modify the geometry transformation registered in the context.
+ * After \ref p4est_vtk_context_new, it is at the default NULL.
+ * \param [in,out] cont         The context is modified.
+ *                              It must not yet have been used to start writing
+ *                              in \ref p4est_vtk_write_header.
+ * \param geom      A \ref p4est_geometry_t structure, or NULL for vertex space.
+ *                  If NULL, \b p4est->connectivity->vertices and
+ *                  \b tree_to_vertex must be non-NULL.
+ */
+void                p4est_vtk_context_set_geom (p4est_vtk_context_t * cont,
+                                                p4est_geometry_t * geom);
+
+/** Modify the context parameter for scaling the quadrants.
+ * After \ref p4est_vtk_context_new, it is at the default 0.95.
+ * \param [in,out] cont         The context is modified.
+ *                              It must not yet have been used to start writing
+ *                              in \ref p4est_vtk_write_header.
+ * \param [in] scale            Scale parameter must be in (0, 1].
  */
-void                p4est_vtk_write_all (p4est_t * p4est,
-                                         p4est_geometry_t * geom,
-                                         double scale,
-                                         int write_tree, int write_level,
-                                         int write_rank, int wrap_rank,
-                                         int num_scalars, int num_vectors,
-                                         const char *filename, ...);
-
-/** This will write the header of the vtu file.
- *
- * Writing a VTK file is split into a couple of routines.
- * The allows there to be an arbitrary number of
+void                p4est_vtk_context_set_scale (p4est_vtk_context_t * cont,
+                                                 double scale);
+
+/** Modify the context parameter for expecting continuous point data.
+ * If set to true, the point data is understood as a continuous field.
+ * In this case, we can significantly reduce the file size when scale == 1.
+ * For discontinuous point data, it should be set to false.
+ * After \ref p4est_vtk_context_new, it is at the default false.
+ * \param [in,out] cont         The context is modified.
+ *                              It must not yet have been used to start writing
+ *                              in \ref p4est_vtk_write_header.
+ * \param [in] continuous       Boolean parameter.
+ */
+void                p4est_vtk_context_set_continuous (p4est_vtk_context_t *
+                                                      cont, int continuous);
+
+/** Cleanly destroy a \ref p4est_vtk_context_t structure.
+ *
+ * This function closes all the file pointers and frees the context.
+ * It can be called even if the VTK output has only been partially
+ * written; in that case the files' content will be incomplete.
+ *
+ * \param[in] context     The VTK file context to be destroyed.
+ */
+void                p4est_vtk_context_destroy (p4est_vtk_context_t * context);
+
+/** Write the VTK header.
+ *
+ * Writing a VTK file is split into a few routines.
+ * This allows there to be an arbitrary number of
  * fields.  The calling sequence would be something like
  *
- * \begincode
- * p4est_vtk_write_header(p4est, geom, 1., 1, 1, 0, "output");
- * p4est_vtk_write_point_scalar (...);
- * ...
- * p4est_vtk_write_footer(p4est, "output");
- * \endcode
+ *     vtk_context = p4est_vtk_context_new (p4est, "output");
+ *     p4est_vtk_context_set_* (vtk_context, parameter);
+ *     vtk_context = p4est_vtk_write_header (vtk_context, ...);
+ *     if (vtk_context == NULL) { error; }
+ *     vtk_context = p4est_vtk_write_cell_data (vtk_context, ...);
+ *     if (vtk_context == NULL) { error; }
+ *     vtk_context = p4est_vtk_write_point_data (vtk_context, ...);
+ *     if (vtk_context == NULL) { error; }
+ *     retval = p4est_vtk_write_footer (vtk_context);
+ *     if (retval) { error; }
  *
- * \param p4est     The p4est to be written.
- * \param geom      A p4est_geometry_t structure or NULL for vertex space.
- * \param scale     The relative length factor of the quadrants.
- *                  Use 1.0 to fit quadrants exactly, less to create gaps.
- * \param write_tree    Boolean to determine if the tree id should be output.
- * \param write_level   Boolean to determine if the tree levels should be output.
- * \param write_rank    Boolean to determine if the MPI rank should be output.
- * \param wrap_rank Number to wrap around the rank with a modulo operation.
- *                  Can be 0 for no wrapping.
- * \param point_scalars  Comma-separated list of point scalar fields, or NULL.
- * \param point_vectors  Comma-separated list of point vector fields, or NULL.
- * \param filename  The first part of the name which will have
- *                  the proc number appended to it (i.e., the
- *                  output file will be filename_procNum.vtu).
+ * \param [in,out] cont    A VTK context created by \ref p4est_vtk_context_new.
+ *                         No vtk_write function may have been called yet.
+ *                         This context is the return value if no error occurs.
  *
- * \return          This returns 0 if no error and -1 if there is an error.
+ * \return          On success, an opaque context (p4est_vtk_context_t) pointer
+ *                  that must be passed to subsequent p4est_vtk calls.  It is
+ *                  required to call \ref p4est_vtk_write_footer eventually with
+ *                  this value.  Returns NULL on error.
  */
-int                 p4est_vtk_write_header (p4est_t * p4est,
-                                            p4est_geometry_t * geom,
-                                            double scale,
-                                            int write_tree, int write_level,
-                                            int write_rank, int wrap_rank,
-                                            const char *point_scalars,
-                                            const char *point_vectors,
-                                            const char *filename);
+p4est_vtk_context_t *p4est_vtk_write_header (p4est_vtk_context_t * cont);
 
-/** This will write a scalar field to the vtu file.
+/** Write VTK cell data.
  *
- * It is good practice to make sure that the scalar field also
- * exists in the comma separated string \a point_scalars passed
- * to \c p4est_vtk_write_header.
+ * There are options to have this function write
+ * the tree id, quadrant level, or MPI rank without explicit input data.
  *
- * Writing a VTK file is split into a couple of routines.
- * The allows there to be an arbitrary number of fields.
+ * Writing a VTK file is split into a few routines.
+ * This allows there to be an arbitrary number of
+ * fields.
  *
- * \param p4est     The p4est to be written.
- * \param geom      A p4est_geometry_t structure or NULL for vertex space.
- * \param filename  The first part of the name which will have
- *                  the proc number appended to it (i.e., the
- *                  output file will be filename_procNum.vtu).
- * \param scalar_name The name of the scalar field.
- * \param values    The point values that will be written.
+ * \param [in,out] cont    A VTK context created by \ref p4est_vtk_context_new.
+ * \param [in] write_tree  Boolean to determine if the tree id should be output.
+ * \param [in] write_level Boolean to determine if the tree levels should be output.
+ * \param [in] write_rank  Boolean to determine if the MPI rank should be output.
+ * \param [in] wrap_rank   Number to wrap around the rank with a modulo operation.
+ *                         Can be 0 for no wrapping.
+ * \param [in] num_cell_scalars Number of cell scalar datasets to output.
+ * \param [in] num_cell_vectors Number of cell vector datasets to output.
  *
- * \return          This returns 0 if no error and -1 if there is an error.
+ * The variable arguments need to be pairs of (fieldname, fieldvalues), followed
+ * by a final argument of the VTK context cont (same as the first argument).
+ * The cell scalar pairs come first, followed by the cell vector pairs, then cont.
+ * Each 'fieldname' argument shall be a char string containing the name of the data
+ * contained in the following 'fieldvalues'. Each of the 'fieldvalues'
+ * arguments shall be an sc_array_t * holding double variables.  The number of
+ * doubles in each sc_array must be exactly \a p4est->local_num_quadrants for
+ * scalar data and \a 3*p4est->local_num_quadrants for vector data.
+ *
+ * \note The current p4est_vtk_context_t structure, \a cont, must be the first
+ * and the last argument
+ * of any call to this function; this argument is used to validate that the
+ * correct number of variable arguments have been provided.
+ *
+ * \return          On success, the context that has been passed in.
+ *                  On failure, returns NULL and deallocates the context.
+ */
+p4est_vtk_context_t *p4est_vtk_write_cell_dataf (p4est_vtk_context_t * cont,
+                                                 int write_tree,
+                                                 int write_level,
+                                                 int write_rank,
+                                                 int wrap_rank,
+                                                 int num_cell_scalars,
+                                                 int num_cell_vectors, ...);
+
+/** This is an alternate version of the varargs function.
+ * Works exactly the same otherwise.
+ * TODO: implement, also for vectors and point data.
  */
-int                 p4est_vtk_write_point_scalar (p4est_t * p4est,
-                                                  p4est_geometry_t * geom,
-                                                  const char *filename,
-                                                  const char *scalar_name,
-                                                  const double *values);
+p4est_vtk_context_t *p4est_vtk_write_cell_data (p4est_vtk_context_t * cont,
+                                                int write_tree,
+                                                int write_level,
+                                                int write_rank,
+                                                int wrap_rank,
+                                                int num_cell_scalars,
+                                                int num_cell_vectors,
+                                                const char *filenames[],
+                                                sc_array_t * values[]);
 
-/** This will write a 3-vector field to the vtu file.
+/** Write VTK point data.
  *
- * It is good practice to make sure that the vector field also
- * exists in the comma separated string \a point_vectors passed
- * to \c p4est_vtk_write_header.
+ * Writing a VTK file is split into a few routines.
+ * This allows there to be an arbitrary number of
+ * fields.
  *
- * Writing a VTK file is split into a couple of routines.
- * The allows there to be an arbitrary number of fields.
+ * \param [in,out] cont    A VTK context created by \ref p4est_vtk_context_new.
+ * \param [in] num_point_scalars Number of point scalar datasets to output.
+ * \param [in] num_point_vectors Number of point vector datasets to output.
  *
- * \param p4est     The p4est to be written.
- * \param geom      A p4est_geometry_t structure or NULL for vertex space.
- * \param filename  The first part of the name which will have
- *                  the proc number appended to it (i.e., the
- *                  output file will be filename_procNum.vtu).
- * \param vector_name The name of the vector field.
- * \param values    The point values that will be written.
+ * The variable arguments need to be pairs of (fieldname, fieldvalues) where
+ * the point scalar pairs come first, followed by the point vector pairs.  Each
+ * 'fieldname' argument shall be a char string containing the name of the data
+ * contained in the following 'fieldvalues'. Each of the 'fieldvalues'
+ * arguments shall be an sc_array_t * holding double variables. The number of
+ * doubles in each sc_array must be exactly the number of components (1 for
+ * scalar and 3 for vector) times 4 times number of elements.
  *
- * \return          This returns 0 if no error and -1 if there is an error.
+ * \note The current
+ * p4est_vtk_context_t structure, cont, must be the last argument of any call
+ * to this function; this argument is used to validate that the correct number
+ * of variable arguments have been provided.
+ *
+ * \note The number of point scalar data in each
+ * sc_array must be exactly \a P4EST_CHILDREN*local_num_quadrants, and the
+ * number of point vector data must be exactly \a
+ * 3*P4EST_CHILDREN*local_num_quadrants. I.e. there must be data for every
+ * corner of every quadrant in the \a p4est, even if the corner is shared by
+ * multiple quadrants.
+ *
+ * \return          On success, the context that has been passed in.
+ *                  On failure, returns NULL and deallocates the context.
  */
-int                 p4est_vtk_write_point_vector (p4est_t * p4est,
-                                                  p4est_geometry_t * geom,
-                                                  const char *filename,
-                                                  const char *vector_name,
-                                                  const double *values);
+p4est_vtk_context_t *p4est_vtk_write_point_dataf (p4est_vtk_context_t * cont,
+                                                  int num_point_scalars,
+                                                  int num_point_vectors, ...);
 
-/** This will write the footer of the vtu file.
- *
- * Writing a VTK file is split into a couple of routines.
- * The allows there to be an arbitrary number of
- * fields.  To write out two fields the
- * calling sequence would be something like
+/** Write the VTU footer and clean up.
  *
- * \begincode
- * p4est_vtk_write_header(p4est, ..., "output");
- * p4est_vtk_write_footer(p4est, "output");
- * \endcode
+ * Writing a VTK file is split into a few routines.
+ * This function writes the footer information to the VTK file and cleanly
+ * destroys the VTK context.
  *
- * \param p4est     The p4est to be written.
- * \param filename  The first part of the name which will have
- *                  the proc number appended to it (i.e., the
- *                  output file will be filename_procNum.vtu).
+ * \param [in] cont Context is deallocated before the function returns.
  *
  * \return          This returns 0 if no error and -1 if there is an error.
  */
-int                 p4est_vtk_write_footer (p4est_t * p4est,
-                                            const char *filename);
+int                 p4est_vtk_write_footer (p4est_vtk_context_t * cont);
 
 SC_EXTERN_C_END;
 
diff --git a/src/p4est_wrap.c b/src/p4est_wrap.c
index d0a8b9b..06acfad 100644
--- a/src/p4est_wrap.c
+++ b/src/p4est_wrap.c
@@ -3,7 +3,9 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 Carsten Burstedde
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -21,12 +23,12 @@
 */
 
 #ifndef P4_TO_P8
+#include <p4est_algorithms.h>
 #include <p4est_bits.h>
-#include <p4est_extended.h>
 #include <p4est_wrap.h>
 #else
+#include <p8est_algorithms.h>
 #include <p8est_bits.h>
-#include <p8est_extended.h>
 #include <p8est_wrap.h>
 #endif
 
@@ -38,6 +40,7 @@ refine_callback (p4est_t * p4est, p4est_topidx_t which_tree,
   const p4est_locidx_t old_counter = pp->inside_counter++;
   const uint8_t       flag = pp->flags[old_counter];
 
+  P4EST_ASSERT (pp->coarsen_delay >= 0);
   P4EST_ASSERT (0 <= old_counter);
   P4EST_ASSERT (0 <= pp->num_replaced
                 && pp->num_replaced <= pp->num_refine_flags);
@@ -47,6 +50,12 @@ refine_callback (p4est_t * p4est, p4est_topidx_t which_tree,
   pp->temp_flags[old_counter + (P4EST_CHILDREN - 1) * pp->num_replaced] =
     flag & ~P4EST_WRAP_REFINE;
 
+  /* increase quadrant's counter of most recent adaptation */
+  /* if refinement actually occurs, it will be reset to zero in all children */
+  if (pp->coarsen_delay && q->p.user_int >= 0) {
+    ++q->p.user_int;
+  }
+
   return flag & P4EST_WRAP_REFINE;
 }
 
@@ -65,9 +74,24 @@ replace_on_refine (p4est_t * p4est, p4est_topidx_t which_tree,
   P4EST_ASSERT (num_outgoing == 1 && num_incoming == P4EST_CHILDREN);
   P4EST_ASSERT (!(flag & (P4EST_WRAP_REFINE | P4EST_WRAP_COARSEN)));
 
+  /* we have set the first flag in the refinement callback, do the others */
   for (k = 1; k < P4EST_CHILDREN; ++k) {
     pp->temp_flags[new_counter + k] = flag;
   }
+
+  /* reset the counter for most recent adaptation */
+  P4EST_ASSERT (pp->coarsen_delay >= 0);
+  if (pp->coarsen_delay) {
+    for (k = 0; k < P4EST_CHILDREN; ++k) {
+      incoming[k]->p.user_int = 0;
+    }
+  }
+
+  /* pass the replaced quadrants to the user-provided function */
+  if (pp->replace_fn != NULL) {
+    pp->replace_fn (p4est, which_tree,
+                    num_outgoing, outgoing, num_incoming, incoming);
+  }
 }
 
 static int
@@ -78,6 +102,8 @@ coarsen_callback (p4est_t * p4est, p4est_topidx_t which_tree,
   const p4est_locidx_t old_counter = pp->inside_counter++;
   int                 k;
 
+  P4EST_ASSERT (pp->coarsen_delay >= 0);
+
   /* are we not coarsening at all, just counting? */
   if (q[1] == NULL) {
     return 0;
@@ -86,6 +112,12 @@ coarsen_callback (p4est_t * p4est, p4est_topidx_t which_tree,
   /* now we are possibly coarsening */
   for (k = 0; k < P4EST_CHILDREN; ++k) {
     if (!(pp->temp_flags[old_counter + k] & P4EST_WRAP_COARSEN)) {
+      /* coarsening flag was not set */
+      return 0;
+    }
+    if (pp->coarsen_delay && q[k]->p.user_int >= 0 &&
+        q[k]->p.user_int <= pp->coarsen_delay) {
+      /* most recent adaptation has been too recent */
       return 0;
     }
   }
@@ -96,36 +128,140 @@ coarsen_callback (p4est_t * p4est, p4est_topidx_t which_tree,
   return 1;
 }
 
+static void
+replace_on_coarsen (p4est_t * p4est, p4est_topidx_t which_tree,
+                    int num_outgoing, p4est_quadrant_t * outgoing[],
+                    int num_incoming, p4est_quadrant_t * incoming[])
+{
+  p4est_wrap_t       *pp = (p4est_wrap_t *) p4est->user_pointer;
+  P4EST_ASSERT (num_incoming == 1 && num_outgoing == P4EST_CHILDREN);
+  P4EST_ASSERT (pp->coarsen_delay > 0);
+
+  /* reset most recent adaptation timer */
+  incoming[0]->p.user_int = pp->coarsen_affect ? 0 : -1;
+
+  /* pass the replaced quadrants to the user-provided function */
+  if (pp->replace_fn != NULL) {
+    pp->replace_fn (p4est, which_tree,
+                    num_outgoing, outgoing, num_incoming, incoming);
+  }
+}
+
+static void
+replace_on_balance (p4est_t * p4est, p4est_topidx_t which_tree,
+                    int num_outgoing, p4est_quadrant_t * outgoing[],
+                    int num_incoming, p4est_quadrant_t * incoming[])
+{
+  p4est_wrap_t       *pp = (p4est_wrap_t *) p4est->user_pointer;
+  int                 k;
+
+  /* this function is called when refinement occurs in balance */
+  P4EST_ASSERT (num_outgoing == 1 && num_incoming == P4EST_CHILDREN);
+  P4EST_ASSERT (pp->coarsen_delay > 0);
+
+  /* negative value means coarsening is allowed next time */
+  for (k = 0; k < P4EST_CHILDREN; ++k) {
+    incoming[k]->p.user_int = -1;
+  }
+
+  /* pass the replaced quadrants to the user-provided function */
+  if (pp->replace_fn != NULL) {
+    pp->replace_fn (p4est, which_tree,
+                    num_outgoing, outgoing, num_incoming, incoming);
+  }
+}
+
 p4est_wrap_t       *
 p4est_wrap_new_conn (sc_MPI_Comm mpicomm, p4est_connectivity_t * conn,
                      int initial_level)
 {
+  return p4est_wrap_new_ext (mpicomm, conn, initial_level,
+                             0, P4EST_CONNECT_FULL, NULL, NULL);
+}
+
+p4est_wrap_t       *
+p4est_wrap_new_p4est (p4est_t * p4est, int hollow, p4est_connect_type_t btype,
+                      p4est_replace_t replace_fn, void *user_pointer)
+{
   p4est_wrap_t       *pp;
 
-  pp = P4EST_ALLOC (p4est_wrap_t, 1);
-  pp->user_pointer = NULL;
+  P4EST_ASSERT (p4est_is_valid (p4est));
+  P4EST_ASSERT (p4est->user_pointer == NULL);
+
+  pp = P4EST_ALLOC_ZERO (p4est_wrap_t, 1);
+
+  pp->hollow = hollow;
+
+  sc_refcount_init (&pp->conn_rc, p4est_package_id);
+  pp->conn = p4est->connectivity;
+  pp->conn_owner = NULL;
 
   pp->p4est_dim = P4EST_DIM;
   pp->p4est_half = P4EST_HALF;
   pp->p4est_faces = P4EST_FACES;
   pp->p4est_children = P4EST_CHILDREN;
-  pp->conn = conn;
-  pp->p4est = p4est_new_ext (mpicomm, pp->conn,
-                             0, initial_level, 1, 0, NULL, NULL);
-  pp->weight_exponent = 0;
-  pp->flags = P4EST_ALLOC_ZERO (uint8_t, pp->p4est->local_num_quadrants);
-  pp->temp_flags = NULL;
-  pp->num_refine_flags = pp->inside_counter = pp->num_replaced = 0;
+  pp->btype = btype;
+  pp->replace_fn = replace_fn;
+  pp->p4est = p4est;
+  pp->weight_exponent = 0;      /* keep this even though using ALLOC_ZERO */
+
+  if (!pp->hollow) {
+    pp->flags = P4EST_ALLOC_ZERO (uint8_t, pp->p4est->local_num_quadrants);
+    pp->ghost = p4est_ghost_new (pp->p4est, pp->btype);
+    pp->mesh = p4est_mesh_new_ext (pp->p4est, pp->ghost, 1, 1, pp->btype);
+  }
 
-  pp->ghost = p4est_ghost_new (pp->p4est, P4EST_CONNECT_FULL);
-  pp->mesh = p4est_mesh_new_ext (pp->p4est, pp->ghost, 1, 1,
-                                 P4EST_CONNECT_FULL);
+  pp->p4est->user_pointer = pp;
+  pp->user_pointer = user_pointer;
 
-  pp->ghost_aux = NULL;
-  pp->mesh_aux = NULL;
-  pp->match_aux = 0;
+  return pp;
+}
+
+p4est_wrap_t       *
+p4est_wrap_new_ext (sc_MPI_Comm mpicomm, p4est_connectivity_t * conn,
+                    int initial_level, int hollow, p4est_connect_type_t btype,
+                    p4est_replace_t replace_fn, void *user_pointer)
+{
+  P4EST_ASSERT (p4est_connectivity_is_valid (conn));
+
+  return p4est_wrap_new_p4est (p4est_new_ext (mpicomm, conn,
+                                              0, initial_level, 1,
+                                              0, NULL, NULL),
+                               hollow, btype, replace_fn, user_pointer);
+}
+
+p4est_wrap_t       *
+p4est_wrap_new_copy (p4est_wrap_t * source, size_t data_size,
+                     p4est_replace_t replace_fn, void *user_pointer)
+{
+  p4est_wrap_t       *pp;
+
+  P4EST_ASSERT (source != NULL);
+
+  pp = P4EST_ALLOC_ZERO (p4est_wrap_t, 1);
+
+  pp->hollow = 1;
+
+  sc_refcount_init_invalid (&pp->conn_rc);
+  pp->conn_owner = (source->conn_owner != NULL ? source->conn_owner : source);
+  pp->conn = pp->conn_owner->conn;
+  sc_refcount_ref (&pp->conn_owner->conn_rc);
+
+  pp->p4est_dim = P4EST_DIM;
+  pp->p4est_half = P4EST_HALF;
+  pp->p4est_faces = P4EST_FACES;
+  pp->p4est_children = P4EST_CHILDREN;
+  pp->btype = source->btype;
+  pp->replace_fn = replace_fn;
+  pp->p4est = p4est_copy (source->p4est, 0);
+  if (data_size > 0) {
+    p4est_reset_data (pp->p4est, data_size, NULL, NULL);
+  }
+
+  pp->weight_exponent = 0;      /* keep this even though using ALLOC_ZERO */
 
   pp->p4est->user_pointer = pp;
+  pp->user_pointer = user_pointer;
 
   return pp;
 }
@@ -215,6 +351,32 @@ p8est_wrap_new_rotwrap (sc_MPI_Comm mpicomm, int initial_level)
 #endif
 
 p4est_wrap_t       *
+p4est_wrap_new_brick (sc_MPI_Comm mpicomm, int bx, int by,
+#ifdef P4_TO_P8
+                      int bz,
+#endif
+                      int px, int py,
+#ifdef P4_TO_P8
+                      int pz,
+#endif
+                      int initial_level)
+{
+  P4EST_ASSERT (bx > 0 && by > 0);
+#ifdef P4_TO_P8
+  P4EST_ASSERT (bz > 0);
+#endif
+  return p4est_wrap_new_conn (mpicomm, p4est_connectivity_new_brick (bx, by,
+#ifdef P4_TO_P8
+                                                                     bz,
+#endif
+                                                                     px, py
+#ifdef P4_TO_P8
+                                                                     , pz
+#endif
+                              ), initial_level);
+}
+
+p4est_wrap_t       *
 p4est_wrap_new_world (int initial_level)
 {
 #ifndef P4_TO_P8
@@ -234,27 +396,119 @@ p4est_wrap_destroy (p4est_wrap_t * pp)
     p4est_ghost_destroy (pp->ghost_aux);
   }
 
-  p4est_mesh_destroy (pp->mesh);
-  p4est_ghost_destroy (pp->ghost);
+  if (!pp->hollow) {
+    p4est_mesh_destroy (pp->mesh);
+    p4est_ghost_destroy (pp->ghost);
+  }
 
   P4EST_FREE (pp->flags);
   P4EST_FREE (pp->temp_flags);
 
   p4est_destroy (pp->p4est);
-  p4est_connectivity_destroy (pp->conn);
+
+  /* safety checks for connectivity ownership */
+  if (pp->conn_owner != NULL) {
+    /* we are a copy of a wrap and have borrowed its connectivity */
+    P4EST_ASSERT (!sc_refcount_is_active (&pp->conn_rc));
+    P4EST_EXECUTE_ASSERT_FALSE (sc_refcount_unref (&pp->conn_owner->conn_rc));
+  }
+  else {
+    /* we are the original wrap that owns the connectivity */
+    P4EST_EXECUTE_ASSERT_TRUE (sc_refcount_unref (&pp->conn_rc));
+    p4est_connectivity_destroy (pp->conn);
+  }
 
   P4EST_FREE (pp);
 }
 
+void
+p4est_wrap_set_hollow (p4est_wrap_t * pp, int hollow)
+{
+  /* Verify consistency */
+  if (!pp->hollow) {
+    P4EST_ASSERT (pp->flags != NULL);
+    P4EST_ASSERT (pp->ghost != NULL);
+    P4EST_ASSERT (pp->mesh != NULL);
+  }
+  else {
+    P4EST_ASSERT (pp->flags == NULL);
+    P4EST_ASSERT (pp->ghost == NULL);
+    P4EST_ASSERT (pp->mesh == NULL);
+  }
+
+  /* Make sure a full wrap is only set to hollow outside of adaptation cycle */
+  P4EST_ASSERT (!pp->match_aux);
+  P4EST_ASSERT (pp->temp_flags == NULL);
+  P4EST_ASSERT (pp->ghost_aux == NULL);
+  P4EST_ASSERT (pp->mesh_aux == NULL);
+
+  /* Do nothing if the status is right */
+  if (hollow == pp->hollow) {
+    return;
+  }
+
+  if (pp->hollow) {
+    /* Allocate the ghost, mesh, and flag members */
+    pp->flags = P4EST_ALLOC_ZERO (uint8_t, pp->p4est->local_num_quadrants);
+    pp->ghost = p4est_ghost_new (pp->p4est, pp->btype);
+    pp->mesh = p4est_mesh_new_ext (pp->p4est, pp->ghost, 1, 1, pp->btype);
+  }
+  else {
+    /* Free and nullify the ghost, mesh, and flag members */
+    p4est_mesh_destroy (pp->mesh);
+    p4est_ghost_destroy (pp->ghost);
+    P4EST_FREE (pp->flags);
+    pp->ghost = NULL;
+    pp->mesh = NULL;
+    pp->flags = NULL;
+  }
+  pp->num_refine_flags = pp->inside_counter = pp->num_replaced = 0;
+  pp->hollow = hollow;
+}
+
+void
+p4est_wrap_set_coarsen_delay (p4est_wrap_t * pp,
+                              int coarsen_delay, int coarsen_affect)
+{
+  size_t              zz;
+  p4est_topidx_t      tt;
+  p4est_t            *p4est;
+  p4est_tree_t       *tree;
+  p4est_quadrant_t   *quadrant;
+  sc_array_t         *tquadrants;
+
+  P4EST_ASSERT (pp != NULL);
+  P4EST_ASSERT (coarsen_delay >= 0);
+
+  pp->coarsen_delay = coarsen_delay;
+  pp->coarsen_affect = coarsen_affect;
+  p4est = pp->p4est;
+  P4EST_ASSERT (p4est->data_size == 0);
+
+  /* initialize delay memory in the quadrants' user field */
+  for (tt = p4est->first_local_tree; tt <= p4est->last_local_tree; ++tt) {
+    tree = p4est_tree_array_index (p4est->trees, tt);
+    tquadrants = &tree->quadrants;
+    for (zz = 0; zz < tquadrants->elem_count; ++zz) {
+      quadrant = p4est_quadrant_array_index (tquadrants, zz);
+      quadrant->p.user_int = 0;
+    }
+  }
+}
+
 p4est_ghost_t      *
 p4est_wrap_get_ghost (p4est_wrap_t * pp)
 {
+  P4EST_ASSERT (!pp->hollow);
+
   return pp->match_aux ? pp->ghost_aux : pp->ghost;
 }
 
 p4est_mesh_t       *
 p4est_wrap_get_mesh (p4est_wrap_t * pp)
 {
+  P4EST_ASSERT (!pp->hollow);
+
   return pp->match_aux ? pp->mesh_aux : pp->mesh;
 }
 
@@ -267,6 +521,7 @@ p4est_wrap_mark_refine (p4est_wrap_t * pp,
   p4est_locidx_t      pos;
   uint8_t             flag;
 
+  P4EST_ASSERT (!pp->hollow);
   P4EST_ASSERT (p4est->first_local_tree <= which_tree);
   P4EST_ASSERT (which_tree <= p4est->last_local_tree);
 
@@ -293,6 +548,7 @@ p4est_wrap_mark_coarsen (p4est_wrap_t * pp,
   p4est_locidx_t      pos;
   uint8_t             flag;
 
+  P4EST_ASSERT (!pp->hollow);
   P4EST_ASSERT (p4est->first_local_tree <= which_tree);
   P4EST_ASSERT (which_tree <= p4est->last_local_tree);
 
@@ -320,6 +576,9 @@ p4est_wrap_adapt (p4est_wrap_t * pp)
   p4est_gloidx_t      global_num;
   p4est_t            *p4est = pp->p4est;
 
+  P4EST_ASSERT (!pp->hollow);
+  P4EST_ASSERT (pp->coarsen_delay >= 0);
+
   P4EST_ASSERT (pp->mesh != NULL);
   P4EST_ASSERT (pp->ghost != NULL);
   P4EST_ASSERT (pp->mesh_aux == NULL);
@@ -353,7 +612,8 @@ p4est_wrap_adapt (p4est_wrap_t * pp)
   local_num = p4est->local_num_quadrants;
 #endif
   global_num = p4est->global_num_quadrants;
-  p4est_coarsen_ext (p4est, 0, 1, coarsen_callback, NULL, NULL);
+  p4est_coarsen_ext (p4est, 0, 1, coarsen_callback, NULL,
+                     pp->coarsen_delay ? replace_on_coarsen : pp->replace_fn);
   P4EST_ASSERT (pp->inside_counter == local_num);
   P4EST_ASSERT (local_num - p4est->local_num_quadrants ==
                 pp->num_replaced * (P4EST_CHILDREN - 1));
@@ -366,12 +626,12 @@ p4est_wrap_adapt (p4est_wrap_t * pp)
   /* Only if refinement and/or coarsening happened do we need to balance */
   if (changed) {
     P4EST_FREE (pp->flags);
-    p4est_balance (p4est, P4EST_CONNECT_FULL, NULL);
+    p4est_balance_ext (p4est, pp->btype, NULL, pp->coarsen_delay ?
+                       replace_on_balance : pp->replace_fn);
     pp->flags = P4EST_ALLOC_ZERO (uint8_t, p4est->local_num_quadrants);
 
-    pp->ghost_aux = p4est_ghost_new (p4est, P4EST_CONNECT_FULL);
-    pp->mesh_aux = p4est_mesh_new_ext (p4est, pp->ghost_aux, 1, 1,
-                                       P4EST_CONNECT_FULL);
+    pp->ghost_aux = p4est_ghost_new (p4est, pp->btype);
+    pp->mesh_aux = p4est_mesh_new_ext (p4est, pp->ghost_aux, 1, 1, pp->btype);
     pp->match_aux = 1;
   }
 #ifdef P4EST_ENABLE_DEBUG
@@ -395,10 +655,66 @@ partition_weight (p4est_t * p4est, p4est_topidx_t which_tree,
   return 1 << ((int) quadrant->level * pp->weight_exponent);
 }
 
+static void
+p4est_wrap_partition_unchanged (p4est_gloidx_t pre_me,
+                                p4est_gloidx_t pre_next,
+                                p4est_gloidx_t post_me,
+                                p4est_gloidx_t post_next,
+                                p4est_locidx_t * unchanged_first,
+                                p4est_locidx_t * unchanged_length,
+                                p4est_locidx_t * unchanged_old_first)
+{
+  p4est_locidx_t      uf, ul, uof;
+  p4est_gloidx_t      unext;
+
+  /* consistency checks */
+  P4EST_ASSERT (0 <= pre_me && pre_me <= pre_next);
+  P4EST_ASSERT (0 <= post_me && post_me <= post_next);
+
+  /* initialize the case that no quadrants stay on this processor */
+  uf = ul = uof = 0;
+
+  /* check whether any quadrants stay at all, and which ones */
+  if (pre_me < post_next && post_me < pre_next) {
+    unext = SC_MIN (pre_next, post_next);
+    if (pre_me <= post_me) {
+      uof = (p4est_locidx_t) (post_me - pre_me);
+      ul = (p4est_locidx_t) (unext - post_me);
+    }
+    else {
+      uf = (p4est_locidx_t) (pre_me - post_me);
+      ul = (p4est_locidx_t) (unext - pre_me);
+    }
+  }
+
+  /* consistency checks */
+  P4EST_ASSERT (uf >= 0 && ul >= 0 && uof >= 0);
+  P4EST_ASSERT ((p4est_gloidx_t) uf + ul <= post_next - post_me);
+  P4EST_ASSERT ((p4est_gloidx_t) uof + ul <= pre_next - pre_me);
+
+  /* assign to output variables */
+  if (unchanged_first != NULL) {
+    *unchanged_first = uf;
+  }
+  if (unchanged_length != NULL) {
+    *unchanged_length = ul;
+  }
+  if (unchanged_old_first != NULL) {
+    *unchanged_old_first = uof;
+  }
+}
+
 int
-p4est_wrap_partition (p4est_wrap_t * pp, int weight_exponent)
+p4est_wrap_partition (p4est_wrap_t * pp, int weight_exponent,
+                      p4est_locidx_t * unchanged_first,
+                      p4est_locidx_t * unchanged_length,
+                      p4est_locidx_t * unchanged_old_first)
 {
   int                 changed;
+  p4est_gloidx_t      pre_me, pre_next;
+  p4est_gloidx_t      post_me, post_next;
+
+  P4EST_ASSERT (!pp->hollow);
 
   P4EST_ASSERT (pp->ghost != NULL);
   P4EST_ASSERT (pp->mesh != NULL);
@@ -410,6 +726,21 @@ p4est_wrap_partition (p4est_wrap_t * pp, int weight_exponent)
   p4est_ghost_destroy (pp->ghost);
   pp->match_aux = 0;
 
+  /* Remember the window onto global quadrant sequence before partition */
+  pre_me = pp->p4est->global_first_quadrant[pp->p4est->mpirank];
+  pre_next = pp->p4est->global_first_quadrant[pp->p4est->mpirank + 1];
+
+  /* Initialize output for the case that the partition does not change */
+  if (unchanged_first != NULL) {
+    *unchanged_first = 0;
+  }
+  if (unchanged_length != NULL) {
+    *unchanged_length = pp->p4est->local_num_quadrants;
+  }
+  if (unchanged_old_first != NULL) {
+    *unchanged_old_first = 0;
+  }
+
   /* In the future the flags could be used to pass partition weights */
   /* We need to lift the restriction on 64 bits for the global weight sum */
   P4EST_ASSERT (weight_exponent == 0 || weight_exponent == 1);
@@ -422,9 +753,22 @@ p4est_wrap_partition (p4est_wrap_t * pp, int weight_exponent)
     P4EST_FREE (pp->flags);
     pp->flags = P4EST_ALLOC_ZERO (uint8_t, pp->p4est->local_num_quadrants);
 
-    pp->ghost = p4est_ghost_new (pp->p4est, P4EST_CONNECT_FULL);
-    pp->mesh = p4est_mesh_new_ext (pp->p4est, pp->ghost, 1, 1,
-                                   P4EST_CONNECT_FULL);
+    pp->ghost = p4est_ghost_new (pp->p4est, pp->btype);
+    pp->mesh = p4est_mesh_new_ext (pp->p4est, pp->ghost, 1, 1, pp->btype);
+
+    /* Query the window onto global quadrant sequence after partition */
+    if (unchanged_first != NULL || unchanged_length != NULL ||
+        unchanged_old_first != NULL) {
+
+      /* compute new window of local quadrants */
+      post_me = pp->p4est->global_first_quadrant[pp->p4est->mpirank];
+      post_next = pp->p4est->global_first_quadrant[pp->p4est->mpirank + 1];
+
+      /* compute the range of quadrants that have stayed on this processor */
+      p4est_wrap_partition_unchanged (pre_me, pre_next, post_me, post_next,
+                                      unchanged_first, unchanged_length,
+                                      unchanged_old_first);
+    }
   }
   else {
     memset (pp->flags, 0, sizeof (uint8_t) * pp->p4est->local_num_quadrants);
@@ -441,6 +785,8 @@ p4est_wrap_partition (p4est_wrap_t * pp, int weight_exponent)
 void
 p4est_wrap_complete (p4est_wrap_t * pp)
 {
+  P4EST_ASSERT (!pp->hollow);
+
   P4EST_ASSERT (pp->ghost != NULL);
   P4EST_ASSERT (pp->mesh != NULL);
   P4EST_ASSERT (pp->ghost_aux != NULL);
@@ -456,19 +802,20 @@ p4est_wrap_complete (p4est_wrap_t * pp)
 static p4est_wrap_leaf_t *
 p4est_wrap_leaf_info (p4est_wrap_leaf_t * leaf)
 {
-#if 0
 #ifdef P4EST_ENABLE_DEBUG
-  int                 nface;
-  p4est_mesh_t       *mesh = p4est_wrap_get_mesh (leaf->pp);
-#endif
+  p4est_t            *p4est = leaf->pp->p4est;
 #endif
+#if 0
   p4est_quadrant_t    corner;
+#endif
+  p4est_quadrant_t   *mirror;
 
-  leaf->total_quad = leaf->tree->quadrants_offset + leaf->which_quad;
-  leaf->quad = p4est_quadrant_array_index (&leaf->tree->quadrants,
-                                           leaf->which_quad);
+  /* complete information on current quadrant */
+  leaf->local_quad = leaf->tree->quadrants_offset + leaf->which_quad;
+  leaf->quad =
+    p4est_quadrant_array_index (leaf->tquadrants, leaf->which_quad);
 
-  leaf->level = (int) leaf->quad->level;
+#if 0
   p4est_qcoord_to_vertex (leaf->pp->conn, leaf->which_tree,
                           leaf->quad->x, leaf->quad->y,
 #ifdef P4_TO_P8
@@ -482,38 +829,79 @@ p4est_wrap_leaf_info (p4est_wrap_leaf_t * leaf)
                           corner.z,
 #endif
                           leaf->upperright);
+#endif
 
 #if 0
 #ifdef P4EST_ENABLE_DEBUG
   printf ("C: Leaf level %d tree %d tree_leaf %d local_leaf %d\n",
-          leaf->level, leaf->which_tree, leaf->which_quad, leaf->total_quad);
-  for (nface = 0; nface < P4EST_FACES; ++nface) {
-    printf ("C: Leaf face %d neighbor leaf %d\n", nface,
-            mesh->quad_to_quad[P4EST_FACES * leaf->total_quad + nface]);
-  }
+          (int) leaf->quad->level, leaf->which_tree, leaf->which_quad,
+          leaf->local_quad);
 #endif
 #endif
 
+  /* track parallel mirror quadrants */
+  if (leaf->mirrors != NULL) {
+    if (leaf->local_quad == leaf->next_mirror_quadrant) {
+      if (++leaf->nm + 1 < (p4est_locidx_t) leaf->mirrors->elem_count) {
+        mirror = p4est_quadrant_array_index (leaf->mirrors, leaf->nm + 1);
+        leaf->next_mirror_quadrant = mirror->p.piggy3.local_num;
+        P4EST_ASSERT (leaf->next_mirror_quadrant > leaf->local_quad);
+        P4EST_ASSERT (leaf->next_mirror_quadrant <
+                      p4est->local_num_quadrants);
+      }
+      else {
+        leaf->next_mirror_quadrant = -1;
+      }
+      leaf->is_mirror = 1;
+    }
+    else {
+      leaf->is_mirror = 0;
+    }
+  }
+
   return leaf;
 }
 
 p4est_wrap_leaf_t  *
-p4est_wrap_leaf_first (p4est_wrap_t * pp)
+p4est_wrap_leaf_first (p4est_wrap_t * pp, int track_mirrors)
 {
-  p4est_wrap_leaf_t  *leaf;
   p4est_t            *p4est = pp->p4est;
+  p4est_wrap_leaf_t  *leaf;
+  p4est_quadrant_t   *mirror;
 
   if (p4est->local_num_quadrants == 0) {
+    P4EST_ASSERT (p4est->first_local_tree == -1);
+    P4EST_ASSERT (p4est->last_local_tree == -2);
     return NULL;
   }
 
+  /* prepare internal state of the leaf iterator */
   leaf = P4EST_ALLOC (p4est_wrap_leaf_t, 1);
   leaf->pp = pp;
   leaf->which_tree = p4est->first_local_tree;
+  P4EST_ASSERT (leaf->which_tree >= 0);
   leaf->tree = p4est_tree_array_index (p4est->trees, leaf->which_tree);
-  P4EST_ASSERT (leaf->tree->quadrants.elem_size > 0);
+  leaf->tquadrants = &leaf->tree->quadrants;
+  P4EST_ASSERT (leaf->tquadrants->elem_size > 0);
   leaf->which_quad = 0;
 
+  /* initialize mirror tracking if desired */
+  leaf->nm = leaf->next_mirror_quadrant = -1;
+  if (track_mirrors) {
+    leaf->mirrors = &(p4est_wrap_get_ghost (pp))->mirrors;
+    if (leaf->mirrors->elem_count > 0) {
+      mirror = p4est_quadrant_array_index (leaf->mirrors, 0);
+      leaf->next_mirror_quadrant = (int) mirror->p.piggy3.local_num;
+      P4EST_ASSERT (leaf->next_mirror_quadrant >= 0);
+      P4EST_ASSERT (leaf->next_mirror_quadrant < p4est->local_num_quadrants);
+    }
+  }
+  else {
+    leaf->mirrors = NULL;
+    leaf->is_mirror = 0;
+  }
+
+  /* complete leaf and mirror information */
   return p4est_wrap_leaf_info (leaf);
 }
 
@@ -524,14 +912,22 @@ p4est_wrap_leaf_next (p4est_wrap_leaf_t * leaf)
 
   P4EST_ASSERT (leaf != NULL);
 
-  if ((size_t) leaf->which_quad + 1 == leaf->tree->quadrants.elem_count) {
+  if ((size_t) leaf->which_quad + 1 == leaf->tquadrants->elem_count) {
     ++leaf->which_tree;
     if (leaf->which_tree > p4est->last_local_tree) {
+#ifdef P4EST_ENABLE_DEBUG
+      if (leaf->mirrors != NULL) {
+        P4EST_ASSERT (leaf->nm + 1 ==
+                      (p4est_locidx_t) leaf->mirrors->elem_count);
+        P4EST_ASSERT (leaf->next_mirror_quadrant == -1);
+      }
+#endif
       P4EST_FREE (leaf);
       return NULL;
     }
     leaf->tree = p4est_tree_array_index (p4est->trees, leaf->which_tree);
-    P4EST_ASSERT (leaf->tree->quadrants.elem_size > 0);
+    leaf->tquadrants = &leaf->tree->quadrants;
+    P4EST_ASSERT (leaf->tquadrants->elem_size > 0);
     leaf->which_quad = 0;
   }
   else {
diff --git a/src/p4est_wrap.h b/src/p4est_wrap.h
index 1801bdf..7a48351 100644
--- a/src/p4est_wrap.h
+++ b/src/p4est_wrap.h
@@ -3,7 +3,9 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 Carsten Burstedde
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -23,7 +25,16 @@
 #ifndef P4EST_WRAP_H
 #define P4EST_WRAP_H
 
+/** \file p4est_wrap.h
+ * The logic in p4est_wrap encapsulates core p4est data structures and provides
+ * functions that clarify the mark-adapt-partition cycle.  There is also an
+ * element iterator that can replace the nested loops over trees and tree
+ * quadrants, respectively, which can help make application code cleaner.
+ */
+
 #include <p4est_mesh.h>
+#include <p4est_extended.h>
+#include <sc_refcount.h>
 
 SC_EXTERN_C_BEGIN;
 
@@ -42,12 +53,34 @@ typedef struct p4est_wrap
   /* this member is never used or changed by p4est_wrap */
   void               *user_pointer;     /**< Convenience member for users */
 
+  /** If true, this wrap has NULL for ghost, mesh, and flag members.
+   * If false, they are properly allocated and kept current internally. */
+  int                 hollow;
+
+  /** Non-negative integer tells us how many adaptations to wait
+   * before any given quadrant may be coarsened again. */
+  int                 coarsen_delay;
+
+  /** Boolean: If true, we delay coarsening not only after refinement,
+   * but also between subsequent coarsenings of the same quadrant. */
+  int                 coarsen_affect;
+
+  /** This reference counter is a workaround for internal use only.
+   * Until we have refcounting/copy-on-write for the connectivity,
+   * we count the references to conn by copies of this wrap structure.
+   * There must be no external references left when this wrap is destroyed.
+   */
+  sc_refcount_t       conn_rc;
+  p4est_connectivity_t *conn;
+  struct p4est_wrap  *conn_owner;
+
   /* these members are considered public and read-only */
   int                 p4est_dim;
   int                 p4est_half;
   int                 p4est_faces;
   int                 p4est_children;
-  p4est_connectivity_t *conn;
+  p4est_connect_type_t btype;
+  p4est_replace_t     replace_fn;
   p4est_t            *p4est;    /**< p4est->user_pointer is used internally */
 
   /* anything below here is considered private und should not be touched */
@@ -65,6 +98,8 @@ typedef struct p4est_wrap
 p4est_wrap_t;
 
 /** Create a p4est wrapper from a given connectivity structure.
+ * The ghost and mesh members are initialized as well as the flags.
+ * The btype is set to P4EST_CONNECT_FULL.
  * \param [in] mpicomm        We expect sc_MPI_Init to be called already.
  * \param [in] conn           Connectivity structure.  Wrap takes ownership.
  * \param [in] initial_level  Initial level of uniform refinement.
@@ -74,6 +109,63 @@ p4est_wrap_t       *p4est_wrap_new_conn (sc_MPI_Comm mpicomm,
                                          p4est_connectivity_t * conn,
                                          int initial_level);
 
+/** Create a wrapper for a given p4est structure.
+ * \param [in,out] p4est      Valid p4est object that we will own.
+ *                            We take ownership of its connectivity too.
+ *                            Its user pointer must be NULL and will be changed.
+ * \param [in] hollow         Do not allocate flags, ghost, and mesh members.
+ * \param [in] btype          The neighborhood used for balance, ghost, mesh.
+ * \param [in] replace_fn     Callback to replace quadrants during refinement,
+ *                            coarsening or balancing in \ref p4est_wrap_adapt.
+ *                            May be NULL.
+ * \param [in] user_pointer   Set the user pointer in \ref p4est_wrap_t.
+ *                            Subsequently, we will never access it.
+ * \return                    A fully initialized p4est_wrap structure.
+ */
+p4est_wrap_t       *p4est_wrap_new_p4est (p4est_t * p4est, int hollow,
+                                          p4est_connect_type_t btype,
+                                          p4est_replace_t replace_fn,
+                                          void *user_pointer);
+
+/** Create a p4est wrapper from a given connectivity structure.
+ * Like p4est_wrap_new_conn, but with extra parameters \a hollow and \a btype.
+ * \param [in] mpicomm        We expect sc_MPI_Init to be called already.
+ * \param [in] conn           Connectivity structure.  Wrap takes ownership.
+ * \param [in] initial_level  Initial level of uniform refinement.
+ *                            No effect if less/equal to zero.
+ * \param [in] hollow         Do not allocate flags, ghost, and mesh members.
+ * \param [in] btype          The neighborhood used for balance, ghost, mesh.
+ * \param [in] replace_fn     Callback to replace quadrants during refinement,
+ *                            coarsening or balancing in \ref p4est_wrap_adapt.
+ *                            May be NULL.
+ * \param [in] user_pointer   Set the user pointer in \ref p4est_wrap_t.
+ *                            Subsequently, we will never access it.
+ * \return                    A fully initialized p4est_wrap structure.
+ */
+p4est_wrap_t       *p4est_wrap_new_ext (sc_MPI_Comm mpicomm,
+                                        p4est_connectivity_t * conn,
+                                        int initial_level, int hollow,
+                                        p4est_connect_type_t btype,
+                                        p4est_replace_t replace_fn,
+                                        void *user_pointer);
+
+/** Create a p4est wrapper from an existing one.
+ * \note This wrapper must be destroyed before the original one.
+ * We set it to hollow and copy the original p4est data structure.
+ * \param [in,out] source   We access the source for debugging purposes.
+ * \param [in] data_size    The data size installed in the copied forest.
+ * \param [in] replace_fn     Callback to replace quadrants during refinement,
+ *                            coarsening or balancing in \ref p4est_wrap_adapt.
+ *                            May be NULL.
+ * \param [in] user_pointer   Set the user pointer in \ref p4est_wrap_t.
+ *                            Subsequently, we will never access it.
+ * \return                    A fully initialized p4est_wrap structure.
+ */
+p4est_wrap_t       *p4est_wrap_new_copy (p4est_wrap_t * source,
+                                         size_t data_size,
+                                         p4est_replace_t replace_fn,
+                                         void *user_pointer);
+
 /** Create p4est and auxiliary data structures.
  * Expects sc_MPI_Init to be called beforehand.
  */
@@ -93,26 +185,61 @@ p4est_wrap_t       *p4est_wrap_new_cubed (sc_MPI_Comm mpicomm,
                                           int initial_level);
 p4est_wrap_t       *p4est_wrap_new_disk (sc_MPI_Comm mpicomm,
                                          int initial_level);
+p4est_wrap_t       *p4est_wrap_new_brick (sc_MPI_Comm mpicomm,
+                                          int bx, int by, int px, int py,
+                                          int initial_level);
 
 /** Passes sc_MPI_COMM_WORLD to p4est_wrap_new_unitsquare. */
 p4est_wrap_t       *p4est_wrap_new_world (int initial_level);
 void                p4est_wrap_destroy (p4est_wrap_t * pp);
 
+/** Change hollow status of the wrap.
+ * A wrap is hollow if the flags, ghost, and mesh members are NULL.
+ * Legal to set to current hollow status, in which case wrap is not changed.
+ * If changed from not hollow to hollow, previously set refinement and
+ * coarsening flags are zeroed.
+ * \param [in,out] pp   The present wrap structure, hollow or not.
+ * \param [in] hollow   The desired hollow status.  If set to hollow,
+ *                      refinement flags are zeroed.
+ */
+void                p4est_wrap_set_hollow (p4est_wrap_t * pp, int hollow);
+
+/** Set a parameter that delays coarsening after adaptation.
+ * If positive each quadrant counts the number of adaptations it has survived.
+ * Calling this function initializes all quadrant counters to zero.
+ * On adaptation we only coarsen a quadrant if it is old enough.
+ * Optionally, we can also delay the time between subsequent coarsenings.
+ * \param [in,out] pp           A valid p4est_wrap structure.
+ * \param [in] coarsen_delay    Set how many adaptation cycles a quadrant has
+ *                              to wait to be allowed to coarsen.
+ *                              Non-negative number; 0 disables the feature.
+ *                              Suggested default value: not larger than 2.
+ * \param [in] coarsen_affect   Boolean; If true, we not only count from the
+ *                              most recent refinement but also between
+ *                              subsequent coarsenings.
+ *                              Suggested default: 0.
+ */
+void                p4est_wrap_set_coarsen_delay (p4est_wrap_t * pp,
+                                                  int coarsen_delay,
+                                                  int coarsen_affect);
+
 /** Return the appropriate ghost layer.
  * This function is necessary since two versions may exist simultaneously
  * after refinement and before partition/complete.
+ * \param [in] pp   Must have !pp->hollow.
  * */
 p4est_ghost_t      *p4est_wrap_get_ghost (p4est_wrap_t * pp);
 
 /** Return the appropriate mesh structure.
  * This function is necessary since two versions may exist simultaneously
  * after refinement and before partition/complete.
+ * \param [in] pp   Must have !pp->hollow.
  * */
 p4est_mesh_t       *p4est_wrap_get_mesh (p4est_wrap_t * pp);
 
 /** Mark a local element for refinement.
  * This will cancel any coarsening mark set previously for this element.
- * \param [in,out] wrap The p4est wrapper to work with.
+ * \param [in,out] pp The p4est wrapper to work with, must not be hollow.
  * \param [in] which_tree The number of the tree this element lives in.
  * \param [in] which_quad The number of this element relative to its tree.
  */
@@ -122,7 +249,7 @@ void                p4est_wrap_mark_refine (p4est_wrap_t * pp,
 
 /** Mark a local element for coarsening.
  * This will cancel any refinement mark set previously for this element.
- * \param [in,out] wrap The p4est wrapper to work with.
+ * \param [in,out] pp The p4est wrapper to work with, must not be hollow.
  * \param [in] which_tree The number of the tree this element lives in.
  * \param [in] which_quad The number of this element relative to its tree.
  */
@@ -134,6 +261,7 @@ void                p4est_wrap_mark_coarsen (p4est_wrap_t * pp,
  * Checks pp->flags as per-quadrant input against p4est_wrap_flags_t.
  * The pp->flags array is updated along with p4est and reset to zeros.
  * Creates ghost_aux and mesh_aux to represent the intermediate mesh.
+ * \param [in,out] pp The p4est wrapper to work with, must not be hollow.
  * \return          boolean whether p4est has changed.
  *                  If true, partition must be called.
  *                  If false, partition must not be called, and
@@ -145,22 +273,42 @@ int                 p4est_wrap_adapt (p4est_wrap_t * pp);
  * Frees the old ghost and mesh first and updates pp->flags along with p4est.
  * The pp->flags array is reset to zeros.
  * Creates ghost and mesh to represent the new mesh.
+ * \param [in,out] pp The p4est wrapper to work with, must not be hollow.
  * \param [in] weight_exponent      Integer weight assigned to each leaf
  *                  according to 2 ** (level * exponent).  Passing 0 assigns
  *                  equal weight to all leaves.  Passing 1 increases the
  *                  leaf weight by a factor of two for each level increase.
  *                  CURRENTLY ONLY 0 AND 1 ARE LEGAL VALUES.
+ * \param [out] unchanged_first
+ *                  If not NULL, is assigned the processor-local index of the
+ *                  first local quadrant that has stayed on this processor.  If
+ *                  no quadrant has stayed, the value is set to zero.
+ *                  This number is in reference to the new (output) partition.
+ * \param [out] unchanged_length
+ *                  If not NULL, is assigned the number of quadrants that have
+ *                  stayed on this processor.  If no quadrant has stayed, the
+ *                  value is set to zero.
+ * \param [out] unchanged_old_first
+ *                  If not NULL, is assigned the processor-local index of the
+ *                  first local quadrant that has stayed with reference to
+ *                  the old (input) partition.  If no quadrant has stayed,
+ *                  the value is set to zero.
  * \return          boolean whether p4est has changed.
  *                  If true, complete must be called.
  *                  If false, complete must not be called.
  */
 int                 p4est_wrap_partition (p4est_wrap_t * pp,
-                                          int weight_exponent);
+                                          int weight_exponent,
+                                          p4est_locidx_t * unchanged_first,
+                                          p4est_locidx_t * unchanged_length,
+                                          p4est_locidx_t *
+                                          unchanged_old_first);
 
 /** Free memory for the intermediate mesh.
  * Sets mesh_aux and ghost_aux to NULL.
  * This function must be used if both refinement and partition effect changes.
  * After this call, we are ready for another mark-refine-partition cycle.
+ * \param [in,out] pp The p4est wrapper to work with, must not be hollow.
  */
 void                p4est_wrap_complete (p4est_wrap_t * pp);
 
@@ -168,27 +316,50 @@ void                p4est_wrap_complete (p4est_wrap_t * pp);
 
 typedef struct p4est_wrap_leaf
 {
-  p4est_wrap_t       *pp;
+  p4est_wrap_t       *pp;             /**< Must contain a valid ghost */
+
+  /* Information about the current quadrant */
+  p4est_topidx_t      which_tree;     /**< Current tree number */
+  p4est_locidx_t      which_quad;     /**< Quadrant number relative to tree */
+  p4est_locidx_t      local_quad;     /**< Quadrant number relative to proc */
+  p4est_tree_t       *tree;           /**< Current tree */
+  sc_array_t         *tquadrants;     /**< Current tree's quadrants */
+  p4est_quadrant_t   *quad;           /**< Current quadrant */
+#if 0                           /* DEPRECATED -- anyone using them? */
   int                 level;
-  p4est_topidx_t      which_tree;
-  p4est_locidx_t      which_quad;
-  p4est_locidx_t      total_quad;
-  p4est_tree_t       *tree;
-  p4est_quadrant_t   *quad;
   double              lowerleft[3];
   double              upperright[3];
+#endif
+
+  /* Information about parallel neighbors */
+  int                 is_mirror;      /**< Quadrant at parallel boundary? */
+  sc_array_t         *mirrors;        /**< If not NULL, from pp's ghost */
+  p4est_locidx_t      nm;             /**< Internal: mirror counter */
+  p4est_locidx_t      next_mirror_quadrant;     /**< Internal: next */
 }
 p4est_wrap_leaf_t;
 
-/* Create an iterator over the leaves in the forest.
+/** Determine whether we have just entered a different tree */
+#define P4EST_LEAF_IS_FIRST_IN_TREE(wleaf) ((wleaf)->which_quad == 0)
+
+/** Create an iterator over the local leaves in the forest.
  * Returns a newly allocated state containing the first leaf,
  * or NULL if the local partition of the tree is empty.
+ * \param [in] pp   Legal p4est_wrap structure, hollow or not.
+ * \param [in] track_mirrors    If true, \a pp must not be hollow and mirror
+ *                              information from the ghost layer is stored.
+ * \return          NULL if processor is empty, otherwise a leaf iterator for
+ *                  subsequent use with \a p4est_wrap_leaf_next.
  */
-p4est_wrap_leaf_t  *p4est_wrap_leaf_first (p4est_wrap_t * pp);
+p4est_wrap_leaf_t  *p4est_wrap_leaf_first (p4est_wrap_t * pp,
+                                           int track_mirrors);
 
 /* Move the forest leaf iterator forward.
- * Returns the state that was input with information for the next leaf,
- * or NULL and deallocates the input if called with the last leaf.
+ * \param [in,out] leaf     A non-NULL leaf iterator created by
+ *                          \ref p4est_wrap_leaf_first.
+ * \return          The state that was input with updated information for the
+ *                  next leaf, or NULL and deallocates the input if called with
+ *                  the last leaf on this processor.
  */
 p4est_wrap_leaf_t  *p4est_wrap_leaf_next (p4est_wrap_leaf_t * leaf);
 
diff --git a/example/p6est/p6est.c b/src/p6est.c
similarity index 85%
rename from example/p6est/p6est.c
rename to src/p6est.c
index b16320b..d1a1fc5 100644
--- a/example/p6est/p6est.c
+++ b/src/p6est.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2013 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -24,9 +25,11 @@
 #include <p6est.h>
 #include <p6est_ghost.h>
 #include <p6est_profile.h>
+#include <p6est_communication.h>
 #include <p4est_lnodes.h>
 #include <p8est.h>
 #include <p4est_extended.h>
+#include <p4est_algorithms.h>
 #include <sc_containers.h>
 #include <p4est_communication.h>
 #include <sc_io.h>
@@ -330,6 +333,7 @@ p6est_memory_used (p6est_t * p6est)
 typedef struct p6est_init_data
 {
   int                 min_zlevel;
+  int                 num_zroot;
   sc_array_t         *layers;
   p6est_init_t        init_fn;
   void               *user_pointer;
@@ -342,7 +346,9 @@ p6est_init_fn (p4est_t * p4est, p4est_topidx_t which_tree,
 {
   p6est_t            *p6est = (p6est_t *) p4est->user_pointer;
   p6est_init_data_t  *init_data = (p6est_init_data_t *) p6est->user_pointer;
-  int                 nlayers = 1 << (init_data->min_zlevel);
+  int                 log_zroot = SC_LOG2_32 (init_data->num_zroot - 1) + 1;
+  int                 nlayers =
+    (1 << (init_data->min_zlevel - log_zroot)) * init_data->num_zroot;
   sc_array_t         *layers = init_data->layers;
   size_t              incount = layers->elem_count, zz;
   size_t              last = incount + nlayers;
@@ -371,6 +377,7 @@ p6est_init_fn (p4est_t * p4est, p4est_topidx_t which_tree,
 p6est_t            *
 p6est_new_ext (sc_MPI_Comm mpicomm, p6est_connectivity_t * connectivity,
                p4est_locidx_t min_quadrants, int min_level, int min_zlevel,
+               int num_zroot,
                int fill_uniform, size_t data_size, p6est_init_t init_fn,
                void *user_pointer)
 {
@@ -380,7 +387,9 @@ p6est_new_ext (sc_MPI_Comm mpicomm, p6est_connectivity_t * connectivity,
   sc_mempool_t       *user_data_pool;
   p6est_init_data_t   init_data;
   int                 mpiret, num_procs, rank;
-  int                 quadpercol = (1 << min_zlevel);
+  int                 log_zroot = SC_LOG2_32 (num_zroot - 1) + 1;
+  int                 quadpercol =
+    (1 << (min_zlevel - log_zroot)) * num_zroot;
   int                 i;
 
   P4EST_GLOBAL_PRODUCTIONF
@@ -388,11 +397,6 @@ p6est_new_ext (sc_MPI_Comm mpicomm, p6est_connectivity_t * connectivity,
      (long long) min_quadrants, SC_MAX (min_zlevel, 0));
   p4est_log_indent_push ();
 
-  mpiret = sc_MPI_Comm_size (mpicomm, &num_procs);
-  SC_CHECK_MPI (mpiret);
-  mpiret = sc_MPI_Comm_rank (mpicomm, &rank);
-  SC_CHECK_MPI (mpiret);
-
   layers = sc_array_new (sizeof (p2est_quadrant_t));
 
   if (data_size > 0) {
@@ -404,18 +408,27 @@ p6est_new_ext (sc_MPI_Comm mpicomm, p6est_connectivity_t * connectivity,
 
   p6est->layer_pool = sc_mempool_new (sizeof (p2est_quadrant_t));
 
-  p6est->mpicomm = mpicomm;
-  p6est->mpisize = num_procs;
-  p6est->mpirank = rank;
   p6est->data_size = data_size;
   p6est->user_pointer = user_pointer;
   p6est->connectivity = connectivity;
   p6est->layers = layers;
   p6est->user_data_pool = user_data_pool;
+  p6est->root_len = num_zroot * P4EST_QUADRANT_LEN (log_zroot);
+
+  p6est_comm_parallel_env_assign (p6est, mpicomm);
+  mpicomm = p6est->mpicomm;
+
+  mpiret = sc_MPI_Comm_size (mpicomm, &num_procs);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_rank (mpicomm, &rank);
+  SC_CHECK_MPI (mpiret);
 
   P4EST_ASSERT (min_zlevel <= P4EST_QMAXLEVEL);
+  P4EST_ASSERT (num_zroot >= 1);
+  P4EST_ASSERT (min_zlevel >= log_zroot);
 
   init_data.min_zlevel = min_zlevel;
+  init_data.num_zroot = num_zroot;
   init_data.layers = layers;
   init_data.init_fn = init_fn;
   init_data.user_pointer = user_pointer;
@@ -451,7 +464,7 @@ p6est_t            *
 p6est_new (sc_MPI_Comm mpicomm, p6est_connectivity_t * connectivity,
            size_t data_size, p6est_init_t init_fn, void *user_pointer)
 {
-  return p6est_new_ext (mpicomm, connectivity, 0, 0, 0, 1,
+  return p6est_new_ext (mpicomm, connectivity, 0, 0, 0, 1, 1,
                         data_size, init_fn, user_pointer);
 }
 
@@ -486,9 +499,6 @@ p6est_new_from_p4est (p4est_t * p4est, double *top_vertices, double height[3],
 
   p6est->layer_pool = sc_mempool_new (sizeof (p2est_quadrant_t));
 
-  p6est->mpicomm = p4est->mpicomm;
-  p6est->mpisize = num_procs = p4est->mpisize;
-  p6est->mpirank = p4est->mpirank;
   p6est->data_size = data_size;
   p6est->user_pointer = user_pointer;
   p6est->connectivity = conn;
@@ -496,6 +506,10 @@ p6est_new_from_p4est (p4est_t * p4est, double *top_vertices, double height[3],
   p6est->user_data_pool = user_data_pool;
   p6est->columns = p4est_copy (p4est, 0);
   p6est->columns->connectivity = conn->conn4;
+  p6est->root_len = P4EST_ROOT_LEN;
+
+  p6est_comm_parallel_env_assign (p6est, p4est->mpicomm);
+  num_procs = p6est->mpisize;
 
   P4EST_ASSERT (min_zlevel <= P4EST_QMAXLEVEL);
 
@@ -503,6 +517,7 @@ p6est_new_from_p4est (p4est_t * p4est, double *top_vertices, double height[3],
   init_data.layers = layers;
   init_data.init_fn = init_fn;
   init_data.user_pointer = user_pointer;
+  init_data.num_zroot = 1;
   p6est->user_pointer = &init_data;
 
   p4est_reset_data (p6est->columns, 0, p6est_init_fn, (void *) p6est);
@@ -541,11 +556,14 @@ p6est_destroy (p6est_t * p6est)
   }
   sc_array_destroy (p6est->layers);
 
-  p4est_destroy (p6est->columns);
+  if (p6est->columns) {
+    p4est_destroy (p6est->columns);
+  }
   if (p6est->user_data_pool != NULL) {
     sc_mempool_destroy (p6est->user_data_pool);
   }
   sc_mempool_destroy (p6est->layer_pool);
+  p6est_comm_parallel_env_release (p6est);
   P4EST_FREE (p6est->global_first_layer);
   P4EST_FREE (p6est);
 }
@@ -553,14 +571,27 @@ p6est_destroy (p6est_t * p6est)
 p6est_t            *
 p6est_copy (p6est_t * input, int copy_data)
 {
+  return p6est_copy_ext (input, copy_data, 0);
+}
+
+p6est_t            *
+p6est_copy_ext (p6est_t * input, int copy_data, int duplicate_comm)
+{
   p6est_t            *p6est = P4EST_ALLOC (p6est_t, 1);
   size_t              zz, qcount = input->layers->elem_count;
 
   memcpy (p6est, input, sizeof (p6est_t));
+
+  /* set parallel environment */
+  p6est_comm_parallel_env_assign (p6est, input->mpicomm);
+  if (duplicate_comm) {
+    p6est_comm_parallel_env_duplicate (p6est);
+  }
   p6est->layers =
     sc_array_new_size (input->layers->elem_size, input->layers->elem_count);
   sc_array_copy (p6est->layers, input->layers);
   p6est->columns = p4est_copy (input->columns, 0);
+  p4est_comm_parallel_env_assign (p6est->columns, p6est->mpicomm);
   p6est->columns->user_pointer = p6est;
   if (copy_data && p6est->data_size > 0) {
     p6est->user_data_pool = sc_mempool_new (p6est->data_size);
@@ -662,9 +693,9 @@ p6est_save_ext (const char *filename, p6est_t * p6est,
   int                 retval, mpiret;
   p4est_t            *savecolumns;
 #ifdef P4EST_MPIIO_WRITE
-  MPI_File            mpifile;
-  MPI_Offset          mpipos;
-  MPI_Offset          mpithis;
+  sc_MPI_File         mpifile;
+  sc_MPI_Offset       mpipos;
+  sc_MPI_Offset       mpithis;
 #else
   long                fthis;
   sc_MPI_Status       mpistatus;
@@ -906,7 +937,7 @@ p6est_load_ext (const char *filename, sc_MPI_Comm mpicomm, size_t data_size,
   size_t              save_data_size, comb_size;
   uint64_t            u64a;
   p4est_gloidx_t     *gfl;
-  int                 rank, mpisize;
+  int                 rank;
 
   P4EST_GLOBAL_PRODUCTIONF ("Into p6est_load %s\n", filename);
   p4est_log_indent_push ();
@@ -980,9 +1011,9 @@ p6est_load_ext (const char *filename, sc_MPI_Comm mpicomm, size_t data_size,
   p6est->columns = columns;
   p6est->connectivity = conn;
   p6est->data_size = data_size;
-  p6est->mpicomm = mpicomm;
-  p6est->mpisize = mpisize = columns->mpisize;
-  p6est->mpirank = rank = columns->mpirank;
+  p6est_comm_parallel_env_assign (p6est, mpicomm);
+  mpicomm = p6est->mpicomm;
+  rank = p6est->mpirank;
   p6est->global_first_layer = gfl = P4EST_ALLOC (p4est_gloidx_t,
                                                  p6est->mpisize + 1);
   p6est->layers =
@@ -1212,8 +1243,8 @@ p6est_refine_columns_ext (p6est_t * p6est, int refine_recursive,
 
   P4EST_GLOBAL_PRODUCTIONF ("Into p6est_refine_columns with %lld total layers"
                             " in %lld total columns\n",
-                            (long long) p6est->global_first_layer[p6est->
-                                                                  mpisize],
+                            (long long) p6est->
+                            global_first_layer[p6est->mpisize],
                             (long long) p6est->columns->global_num_quadrants);
   p4est_log_indent_push ();
   refine_col.refine_col_fn = refine_fn;
@@ -1233,8 +1264,8 @@ p6est_refine_columns_ext (p6est_t * p6est, int refine_recursive,
   p4est_log_indent_pop ();
   P4EST_GLOBAL_PRODUCTIONF ("Done p6est_refine_columns with %lld total layers"
                             " in %lld total columns\n",
-                            (long long) p6est->global_first_layer[p6est->
-                                                                  mpisize],
+                            (long long) p6est->
+                            global_first_layer[p6est->mpisize],
                             (long long) p6est->columns->global_num_quadrants);
 }
 
@@ -1262,8 +1293,8 @@ p6est_refine_layers_ext (p6est_t * p6est, int refine_recursive,
 
   P4EST_GLOBAL_PRODUCTIONF ("Into p6est_refine_layers with %lld total layers"
                             " in %lld total columns, allowed level %d\n",
-                            (long long) p6est->global_first_layer[p6est->
-                                                                  mpisize],
+                            (long long) p6est->
+                            global_first_layer[p6est->mpisize],
                             (long long) p6est->columns->global_num_quadrants,
                             allowed_level);
   p4est_log_indent_push ();
@@ -1341,8 +1372,8 @@ p6est_refine_layers_ext (p6est_t * p6est, int refine_recursive,
   p4est_log_indent_pop ();
   P4EST_GLOBAL_PRODUCTIONF ("Done p6est_refine_layers with %lld total layers "
                             " in %lld total columns\n",
-                            (long long) p6est->global_first_layer[p6est->
-                                                                  mpisize],
+                            (long long) p6est->
+                            global_first_layer[p6est->mpisize],
                             (long long) p6est->columns->global_num_quadrants);
 }
 
@@ -1496,7 +1527,10 @@ p6est_coarsen_all_layers (p6est_t * p6est, p4est_topidx_t which_tree,
     }
     else {
       prevq[stackheight++] = *q;
-      if (q->level > ancestor_level) {
+      /* stop if a) we are as big as the root ancestor, or
+       * b) if we have reached the end of the root domain */
+      if (q->level > ancestor_level &&
+          ((q->z + P4EST_QUADRANT_LEN (q->level)) < p6est->root_len)) {
         P4EST_ASSERT (zz < old_count);
         q = p2est_quadrant_array_index (descendants, zz++);
       }
@@ -1531,7 +1565,8 @@ p6est_coarsen_all_layers (p6est_t * p6est, p4est_topidx_t which_tree,
 
   q = p2est_quadrant_array_index (descendants, new_count - 1);
   endpos = q->z + P4EST_QUADRANT_LEN (q->level);
-  P4EST_ASSERT (endpos - startpos == P4EST_QUADRANT_LEN (ancestor_level));
+  P4EST_ASSERT ((endpos - startpos == P4EST_QUADRANT_LEN (ancestor_level)) ||
+                (startpos == 0 && endpos == p6est->root_len));
 
 #endif
 }
@@ -1702,8 +1737,8 @@ p6est_coarsen_layers_ext (p6est_t * p6est, int coarsen_recursive,
 
   P4EST_GLOBAL_PRODUCTIONF ("Into p6est_coarsen_layers with %lld total layers"
                             " in %lld total columns\n",
-                            (long long) p6est->global_first_layer[p6est->
-                                                                  mpisize],
+                            (long long) p6est->
+                            global_first_layer[p6est->mpisize],
                             (long long) p6est->columns->global_num_quadrants);
   p4est_log_indent_push ();
 
@@ -1803,11 +1838,9 @@ gloidx_compare_overlap (const void *key, const void *array)
   }
 }
 
-p4est_gloidx_t
-p6est_partition_ext (p6est_t * p6est, int partition_for_coarsening,
-                     p6est_weight_t weight_fn)
+static              p4est_gloidx_t
+p6est_partition_after_p4est (p6est_t * p6est)
 {
-  p6est_weight_column_t wc;
   size_t              zz, offset, count, first, last;
   p4est_gloidx_t      my_count;
   p4est_gloidx_t     *new_gfl, *old_gfl = p6est->global_first_layer;
@@ -1840,23 +1873,6 @@ p6est_partition_ext (p6est_t * p6est, int partition_for_coarsening,
   sc_array_t         *recv_procs;
   p4est_gloidx_t      shipped;
   int                *ip;
-  void               *orig_user_pointer = p6est->user_pointer;
-
-  P4EST_GLOBAL_PRODUCTIONF ("Into p6est_parition with %lld total layers"
-                            " in %lld total columns\n",
-                            (long long) p6est->global_first_layer[p6est->
-                                                                  mpisize],
-                            (long long) p6est->columns->global_num_quadrants);
-  p4est_log_indent_push ();
-  /* wrap the p6est_weight_t in a p4est_weight_t */
-  wc.layer_weight_fn = weight_fn;
-  wc.user_pointer = orig_user_pointer;
-  p6est->user_pointer = &wc;
-  p6est_compress_columns (p6est);
-  /* repartition the columns */
-  p4est_partition_ext (p6est->columns, partition_for_coarsening,
-                       p6est_weight_fn);
-  p6est->user_pointer = orig_user_pointer;
 
   /* the column counts (last - first) are correct, but not the offsets. update
    * the offsets */
@@ -1943,12 +1959,17 @@ p6est_partition_ext (p6est_t * p6est, int partition_for_coarsening,
   }
 
   /* find the first proc that owns layers to send to me */
-  search = sc_array_bsearch (&old_gfl_bsearch, new_gfl + rank,
-                             gloidx_compare_overlap);
-  P4EST_ASSERT (search >= 0 && search < mpisize);
-  overlap = search;
-  P4EST_ASSERT (old_gfl[overlap] <= new_gfl[rank] &&
-                new_gfl[rank] < old_gfl[overlap + 1]);
+  if (new_gfl[rank] == new_gfl[mpisize]) {
+    overlap = mpisize;
+  }
+  else {
+    search = sc_array_bsearch (&old_gfl_bsearch, new_gfl + rank,
+                               gloidx_compare_overlap);
+    P4EST_ASSERT (search >= 0 && search < mpisize);
+    overlap = search;
+    P4EST_ASSERT (old_gfl[overlap] <= new_gfl[rank] &&
+                  new_gfl[rank] < old_gfl[overlap + 1]);
+  }
 
   offset = new_gfl[rank];
   self_offset = my_count;
@@ -1997,13 +2018,18 @@ p6est_partition_ext (p6est_t * p6est, int partition_for_coarsening,
                               old_layers->elem_count);
   }
 
-  /* find the first proc that owns layers to send to me */
-  search = sc_array_bsearch (&new_gfl_bsearch, old_gfl + rank,
-                             gloidx_compare_overlap);
-  P4EST_ASSERT (search >= 0 && search < mpisize);
-  overlap = search;
-  P4EST_ASSERT (new_gfl[overlap] <= old_gfl[rank] &&
-                old_gfl[rank] < new_gfl[overlap + 1]);
+  /* find the first proc that owns layers to receive from me */
+  if (old_gfl[rank] == old_gfl[mpisize]) {
+    overlap = mpisize;
+  }
+  else {
+    search = sc_array_bsearch (&new_gfl_bsearch, old_gfl + rank,
+                               gloidx_compare_overlap);
+    P4EST_ASSERT (search >= 0 && search < mpisize);
+    overlap = search;
+    P4EST_ASSERT (new_gfl[overlap] <= old_gfl[rank] &&
+                  old_gfl[rank] < new_gfl[overlap + 1]);
+  }
 
   offset = old_gfl[rank];
   /* for every proc whose new range overlaps with this rank's old range */
@@ -2114,6 +2140,35 @@ p6est_partition_ext (p6est_t * p6est, int partition_for_coarsening,
   sc_array_destroy (send_requests);
   sc_array_destroy (old_layers);
 
+  return shipped;
+}
+
+p4est_gloidx_t
+p6est_partition_ext (p6est_t * p6est, int partition_for_coarsening,
+                     p6est_weight_t weight_fn)
+{
+  p6est_weight_column_t wc;
+  p4est_gloidx_t      shipped;
+  void               *orig_user_pointer = p6est->user_pointer;
+
+  P4EST_GLOBAL_PRODUCTIONF ("Into p6est_parition with %lld total layers"
+                            " in %lld total columns\n",
+                            (long long) p6est->
+                            global_first_layer[p6est->mpisize],
+                            (long long) p6est->columns->global_num_quadrants);
+  p4est_log_indent_push ();
+  /* wrap the p6est_weight_t in a p4est_weight_t */
+  wc.layer_weight_fn = weight_fn;
+  wc.user_pointer = orig_user_pointer;
+  p6est->user_pointer = &wc;
+  p6est_compress_columns (p6est);
+  /* repartition the columns */
+  p4est_partition_ext (p6est->columns, partition_for_coarsening,
+                       p6est_weight_fn);
+  p6est->user_pointer = orig_user_pointer;
+
+  shipped = p6est_partition_after_p4est (p6est);
+
   p4est_log_indent_pop ();
   P4EST_GLOBAL_PRODUCTIONF
     ("Done p6est_partition shipped %lld layers %.3g%%\n",
@@ -2123,6 +2178,264 @@ p6est_partition_ext (p6est_t * p6est, int partition_for_coarsening,
   return shipped;
 }
 
+/** Make an arbitrary layer partition respect columns: any partition boundary
+ * that falls in the middle of a column is moved to the end of that column.
+ * \param [in] p6est  Forest providing the current layer and column layout.
+ * \param [in,out] num_layers_in_proc  Proposed layer count for each rank,
+ *                    overwritten with the corrected counts. */
+void
+p6est_partition_correct (p6est_t * p6est, p4est_locidx_t * num_layers_in_proc)
+{
+  int                 mpisize = p6est->mpisize;
+  int                 mpirank = p6est->mpirank;
+  int                 p, mpiret;
+  p4est_gloidx_t     *new_offsets;
+  p4est_gloidx_t     *new_offsets_recv;
+  p4est_gloidx_t      my_start = p6est->global_first_layer[mpirank];
+  p4est_gloidx_t      my_end = p6est->global_first_layer[mpirank + 1];
+  p4est_gloidx_t      offset = 0;
+
+  /* offset arrays need mpisize + 1 entries: entry mpisize holds the total */
+  new_offsets = P4EST_ALLOC_ZERO (p4est_gloidx_t, mpisize + 1);
+  new_offsets_recv = P4EST_ALLOC (p4est_gloidx_t, mpisize + 1);
+  new_offsets[mpisize] = p6est->global_first_layer[mpisize];
+
+  for (p = 0; p < mpisize; p++) {
+    /* entry p is set on the rank whose current layer range holds offset */
+    if (offset >= my_start && offset < my_end) {
+      size_t              local_offset = offset - my_start;
+      p4est_topidx_t      jt;
+
+      new_offsets[p] = offset;
+      for (jt = p6est->columns->first_local_tree;
+           jt <= p6est->columns->last_local_tree; ++jt) {
+        size_t              zz;
+        p4est_tree_t       *tree =
+          p4est_tree_array_index (p6est->columns->trees, jt);
+        sc_array_t         *tquadrants = &tree->quadrants;
+
+        for (zz = 0; zz < tquadrants->elem_count; ++zz) {
+          size_t              first, last;
+          p4est_quadrant_t   *col =
+            p4est_quadrant_array_index (tquadrants, zz);
+
+          P6EST_COLUMN_GET_RANGE (col, &first, &last);
+          /* boundary strictly inside this column: move it to the column end */
+          if (local_offset > first && local_offset < last) {
+            new_offsets[p] = my_start + last;
+            break;
+          }
+        }
+      }
+    }
+    if (offset == p6est->global_first_layer[mpisize]) {
+      new_offsets[p] = offset;
+    }
+    offset += num_layers_in_proc[p];
+  }
+  /* ranks not holding an offset left a zero, so the maximum picks the owner */
+  mpiret =
+    sc_MPI_Allreduce (new_offsets, new_offsets_recv, mpisize + 1,
+                      P4EST_MPI_GLOIDX, sc_MPI_MAX, p6est->mpicomm);
+  SC_CHECK_MPI (mpiret);
+  for (p = 0; p < mpisize; p++) {
+    num_layers_in_proc[p] = new_offsets_recv[p + 1] - new_offsets_recv[p];
+  }
+  P4EST_FREE (new_offsets);
+  P4EST_FREE (new_offsets_recv);
+}
+
+/** Convert a layer partition that respects columns into a column partition.
+ * \param [in] p6est                 Forest providing the current layout.
+ * \param [in] num_layers_in_proc    Layer count per rank; each rank boundary
+ *                                   must coincide with the start of a column.
+ * \param [out] num_columns_in_proc  Receives the column count per rank. */
+void
+p6est_partition_to_p4est_partition (p6est_t * p6est,
+                                    p4est_locidx_t * num_layers_in_proc,
+                                    p4est_locidx_t * num_columns_in_proc)
+{
+  int                 mpisize = p6est->mpisize;
+  int                 mpirank = p6est->mpirank;
+  int                 p, mpiret;
+  p4est_gloidx_t     *new_offsets;
+  p4est_gloidx_t     *new_offsets_recv;
+  p4est_gloidx_t      my_start = p6est->global_first_layer[mpirank];
+  p4est_gloidx_t      my_end = p6est->global_first_layer[mpirank + 1];
+  p4est_gloidx_t      offset = 0;
+
+  /* offset arrays need mpisize + 1 entries: entry mpisize holds the total */
+  new_offsets = P4EST_ALLOC_ZERO (p4est_gloidx_t, mpisize + 1);
+  new_offsets_recv = P4EST_ALLOC (p4est_gloidx_t, mpisize + 1);
+  new_offsets[mpisize] = p6est->columns->global_num_quadrants;
+
+  for (p = 0; p < mpisize; p++) {
+    if (offset >= my_start && offset < my_end) {
+      size_t              local_offset = offset - my_start;
+      p4est_topidx_t      jt;
+
+      new_offsets[p] = offset;
+      for (jt = p6est->columns->first_local_tree;
+           jt <= p6est->columns->last_local_tree; ++jt) {
+        size_t              zz;
+        p4est_tree_t       *tree =
+          p4est_tree_array_index (p6est->columns->trees, jt);
+        sc_array_t         *tquadrants = &tree->quadrants;
+
+        for (zz = 0; zz < tquadrants->elem_count; ++zz) {
+          size_t              first, last;
+          p4est_quadrant_t   *col =
+            p4est_quadrant_array_index (tquadrants, zz);
+
+          P6EST_COLUMN_GET_RANGE (col, &first, &last);
+          if (local_offset >= first && local_offset < last) {
+            P4EST_ASSERT (local_offset == first);
+            new_offsets[p] =
+              zz + tree->quadrants_offset +
+              p6est->columns->global_first_quadrant[mpirank];
+            break;
+          }
+        }
+      }
+    }
+    if (offset == p6est->global_first_layer[mpisize]) {
+      new_offsets[p] = p6est->columns->global_num_quadrants;
+    }
+    offset += num_layers_in_proc[p];
+  }
+  /* ranks not holding an offset left a zero, so the maximum picks the owner */
+  mpiret =
+    sc_MPI_Allreduce (new_offsets, new_offsets_recv, mpisize + 1,
+                      P4EST_MPI_GLOIDX, sc_MPI_MAX, p6est->mpicomm);
+  SC_CHECK_MPI (mpiret);
+  for (p = 0; p < mpisize; p++) {
+    num_columns_in_proc[p] = new_offsets_recv[p + 1] - new_offsets_recv[p];
+  }
+  P4EST_FREE (new_offsets);
+  P4EST_FREE (new_offsets_recv);
+}
+
+/** Convert a column partition into the corresponding layer partition.
+ * \param [in] p6est                 Forest providing the current layout.
+ * \param [in] num_columns_in_proc   Column count for each rank.
+ * \param [out] num_layers_in_proc   Receives the layer count for each rank;
+ *                                   every rank boundary starts a column. */
+void
+p4est_partition_to_p6est_partition (p6est_t * p6est,
+                                    p4est_locidx_t * num_columns_in_proc,
+                                    p4est_locidx_t * num_layers_in_proc)
+{
+  int                 mpisize = p6est->mpisize;
+  int                 mpirank = p6est->mpirank;
+  int                 p, mpiret;
+  p4est_gloidx_t     *new_offsets;
+  p4est_gloidx_t     *new_offsets_recv;
+  p4est_gloidx_t      my_start =
+    p6est->columns->global_first_quadrant[mpirank];
+  p4est_gloidx_t      my_end =
+    p6est->columns->global_first_quadrant[mpirank + 1];
+  p4est_gloidx_t      offset = 0;
+
+  /* offset arrays need mpisize + 1 entries: entry mpisize holds the total */
+  new_offsets = P4EST_ALLOC_ZERO (p4est_gloidx_t, mpisize + 1);
+  new_offsets_recv = P4EST_ALLOC (p4est_gloidx_t, mpisize + 1);
+  new_offsets[mpisize] = p6est->global_first_layer[mpisize];
+
+  for (p = 0; p < mpisize; p++) {
+    if (offset >= my_start && offset < my_end) {
+      size_t              local_offset = offset - my_start;
+      p4est_topidx_t      jt;
+
+      new_offsets[p] = offset;
+      for (jt = p6est->columns->first_local_tree;
+           jt <= p6est->columns->last_local_tree; ++jt) {
+        size_t              zz;
+        p4est_tree_t       *tree =
+          p4est_tree_array_index (p6est->columns->trees, jt);
+        sc_array_t         *tquadrants = &tree->quadrants;
+
+        for (zz = 0; zz < tquadrants->elem_count; ++zz) {
+          size_t              first, last;
+          p4est_quadrant_t   *col =
+            p4est_quadrant_array_index (tquadrants, zz);
+
+          P6EST_COLUMN_GET_RANGE (col, &first, &last);
+          if (zz + tree->quadrants_offset == local_offset) {
+            new_offsets[p] = p6est->global_first_layer[mpirank] + first;
+            break;
+          }
+        }
+      }
+    }
+    if (offset == p6est->columns->global_num_quadrants) {
+      /* keep looping: trailing empty ranks need this end offset as well */
+      new_offsets[p] = p6est->global_first_layer[mpisize];
+    }
+    offset += num_columns_in_proc[p];
+  }
+  /* ranks not holding an offset left a zero, so the maximum picks the owner */
+  mpiret =
+    sc_MPI_Allreduce (new_offsets, new_offsets_recv, mpisize + 1,
+                      P4EST_MPI_GLOIDX, sc_MPI_MAX, p6est->mpicomm);
+  SC_CHECK_MPI (mpiret);
+  for (p = 0; p < mpisize; p++) {
+    num_layers_in_proc[p] = new_offsets_recv[p + 1] - new_offsets_recv[p];
+  }
+  P4EST_FREE (new_offsets);
+  P4EST_FREE (new_offsets_recv);
+}
+
+/** Run p4est_partition_for_coarsening on the column counts of a partition.
+ * \param [in] p6est               Forest whose columns are passed to p4est.
+ * \param [in,out] num_layers_in_proc  Layer count per rank, handed to the
+ *                layer-to-column conversion and to the conversion back.
+ * \return        Always 0. */
+p4est_gloidx_t
+p6est_partition_for_coarsening (p6est_t * p6est,
+                                p4est_locidx_t * num_layers_in_proc)
+{
+  p4est_locidx_t     *col_counts =
+    P4EST_ALLOC (p4est_locidx_t, p6est->mpisize);
+
+  p6est_partition_to_p4est_partition (p6est, num_layers_in_proc, col_counts);
+  p4est_partition_for_coarsening (p6est->columns, col_counts);
+  p4est_partition_to_p6est_partition (p6est, col_counts, num_layers_in_proc);
+  P4EST_FREE (col_counts);
+  return 0;
+}
+
+/** Repartition the layers according to given per-rank layer counts.
+ * \param [in] num_layers_in_proc  Requested layer count for each rank.
+ * \return  The shipped count returned by p6est_partition_after_p4est. */
+p4est_gloidx_t
+p6est_partition_given (p6est_t * p6est, p4est_locidx_t * num_layers_in_proc)
+{
+  p4est_gloidx_t      shipped;
+  p4est_locidx_t     *num_columns_in_proc;
+
+  P4EST_GLOBAL_PRODUCTIONF ("Into p6est_partition_given with %lld total layers"
+                            " in %lld total columns\n",
+                            (long long) p6est->
+                            global_first_layer[p6est->mpisize],
+                            (long long) p6est->columns->global_num_quadrants);
+  p4est_log_indent_push ();
+
+  /* translate the layer counts into column counts */
+  num_columns_in_proc = P4EST_ALLOC (p4est_locidx_t, p6est->mpisize);
+  p6est_partition_to_p4est_partition (p6est, num_layers_in_proc,
+                                      num_columns_in_proc);
+  /* repartition the columns */
+  p4est_partition_given (p6est->columns, num_columns_in_proc);
+  P4EST_FREE (num_columns_in_proc);
+  shipped = p6est_partition_after_p4est (p6est);
+  p4est_log_indent_pop ();
+  P4EST_GLOBAL_PRODUCTIONF
+    ("Done p6est_partition_given shipped %lld layers %.3g%%\n",
+     (long long) shipped,
+     shipped * 100. / p6est->global_first_layer[p6est->mpisize]);
+  return shipped;
+}
+
 p4est_gloidx_t
 p6est_partition (p6est_t * p6est, p6est_weight_t weight_fn)
 {
@@ -2184,8 +2497,8 @@ p6est_balance_ext (p6est_t * p6est, p8est_connect_type_t btype,
 
   P4EST_GLOBAL_PRODUCTIONF ("Into p6est_balance with %lld total layers"
                             " in %lld total columns\n",
-                            (long long) p6est->global_first_layer[p6est->
-                                                                  mpisize],
+                            (long long) p6est->
+                            global_first_layer[p6est->mpisize],
                             (long long) p6est->columns->global_num_quadrants);
   p4est_log_indent_push ();
 
@@ -2260,8 +2573,8 @@ p6est_balance_ext (p6est_t * p6est, p8est_connect_type_t btype,
   p4est_log_indent_pop ();
   P4EST_GLOBAL_PRODUCTIONF ("Done p6est_balance with %lld total layers "
                             "in %lld total columns\n",
-                            (long long) p6est->global_first_layer[p6est->
-                                                                  mpisize],
+                            (long long) p6est->
+                            global_first_layer[p6est->mpisize],
                             (long long) p6est->columns->global_num_quadrants);
 }
 
@@ -2276,6 +2589,7 @@ unsigned
 p2est_quadrant_checksum (sc_array_t * quadrants,
                          sc_array_t * checkarray, size_t first_quadrant)
 {
+#ifdef P4EST_HAVE_ZLIB
   int                 own_check;
   size_t              kz, qcount;
   unsigned            crc;
@@ -2311,11 +2625,16 @@ p2est_quadrant_checksum (sc_array_t * quadrants,
   }
 
   return crc;
+#else
+  SC_ABORT ("Configure did not find a recent enough zlib.  Abort.\n");
+  return 0;
+#endif
 }
 
 unsigned
 p6est_checksum (p6est_t * p6est)
 {
+#ifdef P4EST_HAVE_ZLIB
   uLong               columncrc, locallayercrc, layercrc;
   sc_array_t          checkarray;
   size_t              scount, globalscount;
@@ -2334,4 +2653,9 @@ p6est_checksum (p6est_t * p6est)
   globalscount = p6est->global_first_layer[p6est->mpisize] * 8;
 
   return adler32_combine (columncrc, layercrc, globalscount);
+#else
+  sc_abort_collective
+    ("Configure did not find a recent enough zlib.  Abort.\n");
+  return 0;
+#endif
 }
diff --git a/example/p6est/p6est.h b/src/p6est.h
similarity index 94%
rename from example/p6est/p6est.h
rename to src/p6est.h
index d079385..6e27408 100644
--- a/example/p6est/p6est.h
+++ b/src/p6est.h
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2013 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -21,6 +22,9 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
 
+#ifndef P6EST_H
+#define P6EST_H
+
 /** \file p6est.h
  *
  * A hybrid 2D+1D AMR extension.
@@ -46,9 +50,6 @@
  * p8est_lnodes_new().
  */
 
-#ifndef P6EST_H
-#define P6EST_H
-
 /* 2+1D refinement is based on the 2D p4est datatypes */
 #include <p4est.h>
 /* We need p8est_connect_type_t typedef from p8est_connectivity */
@@ -165,6 +166,8 @@ typedef struct p6est
   sc_MPI_Comm         mpicomm;          /**< MPI communicator */
   int                 mpisize,          /**< number of MPI processes */
                       mpirank;          /**< this process's MPI rank */
+  int                 mpicomm_owned;    /**< whether this communicator is
+                                             owned by the forest */
   size_t              data_size;        /**< size of per-quadrant p.user_data
                      (see p2est_quadrant_t::p2est_quadrant_data::user_data) */
   void               *user_pointer;     /**< convenience pointer for users,
@@ -175,12 +178,13 @@ typedef struct p6est
   sc_array_t         *layers;   /**< single array that stores
                                      p2est_quadrant_t layers within columns */
   sc_mempool_t       *user_data_pool;   /**< memory allocator for user data */
-                                        /* WARNING: This is NULL if data size
-                                         *          equals zero.  */
+  /* WARNING: This is NULL if data size
+   *          equals zero.  */
   sc_mempool_t       *layer_pool;       /**< memory allocator
                                              for temporary layers */
   p4est_gloidx_t     *global_first_layer; /**< first global quadrant index for
                                                each process and 1 beyond */
+  p4est_qcoord_t      root_len; /**< height of the domain */
 }
 p6est_t;
 
@@ -475,6 +479,26 @@ p6est_comm_tag_t;
  */
 p4est_gloidx_t      p6est_partition (p6est_t * p6est,
                                      p6est_weight_t weight_fn);
+void                p6est_partition_correct (p6est_t * p6est,
+                                             p4est_locidx_t *
+                                             num_layers_in_proc);
+void                p6est_partition_to_p4est_partition (p6est_t * p6est,
+                                                        p4est_locidx_t *
+                                                        num_layers_in_proc,
+                                                        p4est_locidx_t *
+                                                        num_columns_in_proc);
+void                p4est_partition_to_p6est_partition (p6est_t * p6est,
+                                                        p4est_locidx_t *
+                                                        num_columns_in_proc,
+                                                        p4est_locidx_t *
+                                                        num_layers_in_proc);
+
+p4est_gloidx_t      p6est_partition_for_coarsening (p6est_t * p6est,
+                                                    p4est_locidx_t *
+                                                    num_layers_in_proc);
+p4est_gloidx_t      p6est_partition_given (p6est_t * p6est,
+                                           p4est_locidx_t *
+                                           num_layers_in_proc);
 
 /** Compute the checksum for a forest.
  * Based on quadrant arrays only. It is independent of partition and mpisize.
diff --git a/src/p6est_communication.c b/src/p6est_communication.c
new file mode 100644
index 0000000..3a46027
--- /dev/null
+++ b/src/p6est_communication.c
@@ -0,0 +1,196 @@
+/*
+  This file is part of p4est.
+  p4est is a C library to manage a collection (a forest) of multiple
+  connected adaptive quadtrees or octrees in parallel.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
+
+  p4est is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  p4est is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with p4est; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+#include <p4est_communication.h>
+#include <p6est_communication.h>
+
+/* Attach mpicomm to the forest without taking ownership of it, then refresh
+ * the cached communicator size and rank. */
+void
+p6est_comm_parallel_env_assign (p6est_t * p6est, sc_MPI_Comm mpicomm)
+{
+  /* the communicator is recorded as not owned by the forest */
+  p6est->mpicomm_owned = 0;
+  p6est->mpicomm = mpicomm;
+
+  /* mpisize and mpirank are derived from the communicator just stored */
+  p6est_comm_parallel_env_get_info (p6est);
+}
+
+/* Swap the forest's communicator for a duplicate of itself and mark the
+ * duplicate as owned by the forest.  The previous handle is not freed here. */
+void
+p6est_comm_parallel_env_duplicate (p6est_t * p6est)
+{
+  sc_MPI_Comm         original = p6est->mpicomm;
+  int                 mpiret;
+
+  /* the duplicate overwrites the handle stored in the forest */
+  mpiret = sc_MPI_Comm_dup (original, &p6est->mpicomm);
+  SC_CHECK_MPI (mpiret);
+
+  /* flag it so that p6est_comm_parallel_env_release() frees it */
+  p6est->mpicomm_owned = 1;
+}
+
+/* Detach the communicator from the forest.  An owned communicator is freed;
+ * afterwards the forest holds the NULL communicator, size 0 and an undefined
+ * rank. */
+void
+p6est_comm_parallel_env_release (p6est_t * p6est)
+{
+  if (p6est->mpicomm_owned) {
+    /* only a communicator flagged as owned is freed */
+    int                 mpiret = sc_MPI_Comm_free (&p6est->mpicomm);
+
+    SC_CHECK_MPI (mpiret);
+  }
+
+  /* reset the parallel environment to its empty state */
+  p6est->mpicomm_owned = 0;
+  p6est->mpicomm = sc_MPI_COMM_NULL;
+  p6est->mpirank = sc_MPI_UNDEFINED;
+  p6est->mpisize = 0;
+}
+
+/* Exchange the forest's communicator for mpicomm, which is assigned as not
+ * owned.  Debug builds assert that the old and new communicators compare as
+ * identical or congruent. */
+void
+p6est_comm_parallel_env_replace (p6est_t * p6est, sc_MPI_Comm mpicomm)
+{
+#ifdef P4EST_ENABLE_DEBUG
+  {
+    /* the new communicator must have the same size and rank order */
+    int                 comparison;
+    int                 mpiret =
+      sc_MPI_Comm_compare (p6est->mpicomm, mpicomm, &comparison);
+
+    SC_CHECK_MPI (mpiret);
+    P4EST_ASSERT (comparison == sc_MPI_CONGRUENT
+                  || comparison == sc_MPI_IDENT);
+  }
+#endif
+
+  /* drop the old environment first, then adopt the new communicator */
+  p6est_comm_parallel_env_release (p6est);
+  p6est_comm_parallel_env_assign (p6est, mpicomm);
+}
+
+/* Query the forest's communicator and cache its size and this process's rank
+ * in p6est->mpisize and p6est->mpirank. */
+void
+p6est_comm_parallel_env_get_info (p6est_t * p6est)
+{
+  int                 mpiret;
+
+  /* number of processes in the communicator */
+  mpiret = sc_MPI_Comm_size (p6est->mpicomm, &p6est->mpisize);
+  SC_CHECK_MPI (mpiret);
+
+  /* rank of the calling process in the communicator */
+  mpiret = sc_MPI_Comm_rank (p6est->mpicomm, &p6est->mpirank);
+  SC_CHECK_MPI (mpiret);
+}
+
+/* Return nonzero exactly when the forest's communicator is the NULL
+ * communicator. */
+int
+p6est_comm_parallel_env_is_null (p6est_t * p6est)
+{
+  const int           is_null = (p6est->mpicomm == sc_MPI_COMM_NULL);
+
+  return is_null;
+}
+
+/* Convenience wrapper: forwards to p6est_comm_parallel_env_reduce_ext() with
+ * the NULL group, add_to_beginning = 0, and no rank map requested. */
+int
+p6est_comm_parallel_env_reduce (p6est_t ** p6est_supercomm)
+{
+  int               **no_rank_map = NULL;
+
+  return p6est_comm_parallel_env_reduce_ext (p6est_supercomm,
+                                             sc_MPI_GROUP_NULL, 0,
+                                             no_rank_map);
+}
+
+/* Reduce the forest's communicator by reducing the communicator of its
+ * column forest and adopting the result.
+ *
+ * The column forest is reduced with p4est_comm_parallel_env_reduce_ext().
+ * If that reports this rank as empty, the p6est is destroyed and
+ * *p6est_supercomm is set to NULL.  If the reduced communicator has the same
+ * size as before, nothing else changes.  Otherwise the p6est takes over the
+ * column forest's communicator (and its ownership, if it was owned) and
+ * global_first_layer is rebuilt for the remaining ranks.
+ *
+ * \param [in,out] p6est_supercomm  Forest whose communicator is reduced.
+ * \param [in]     group_add        Forwarded to the p4est reduction.
+ * \param [in]     add_to_beginning Forwarded to the p4est reduction.
+ * \param [out]    ranks_subcomm    If not NULL, receives the rank array
+ *                                  obtained from the p4est reduction (the
+ *                                  caller then owns it), or NULL when this
+ *                                  rank is empty or no reduction took place.
+ * \return  0 if the p6est was destroyed on this rank, 1 otherwise.
+ */
+int
+p6est_comm_parallel_env_reduce_ext (p6est_t ** p6est_supercomm,
+                                    sc_MPI_Group group_add,
+                                    int add_to_beginning, int **ranks_subcomm)
+{
+  p6est_t            *p6est = *p6est_supercomm;
+  /* size and layer offsets as they are before the reduction; both are
+   * needed after the parallel environment has been released below */
+  int                 mpisize = p6est->mpisize;
+  int                 mpiret;
+  p4est_gloidx_t     *global_first_layer = p6est->global_first_layer;
+
+  p4est_gloidx_t     *n_quadrants;
+  int                 submpisize;
+  sc_MPI_Comm         submpicomm;
+  /* start from NULL so the assertions below never read an indeterminate
+   * value, whatever the p4est reduction does with its output argument */
+  int                *ranks = NULL;
+  int                 i;
+  int                 is_nonempty;
+
+  /* reduce MPI communicator of column layout */
+  is_nonempty =
+    p4est_comm_parallel_env_reduce_ext (&(p6est->columns), group_add,
+                                        add_to_beginning, &ranks);
+
+  /* destroy p6est and exit if this rank is empty */
+  if (!is_nonempty) {
+    p6est->columns = NULL;
+    p6est_destroy (p6est);
+    *p6est_supercomm = NULL;
+    if (ranks_subcomm) {
+      *ranks_subcomm = NULL;
+    }
+    P4EST_ASSERT (ranks == NULL);
+    return 0;
+  }
+
+  /* get sub-communicator */
+  submpicomm = p6est->columns->mpicomm;
+
+  /* update size of new MPI communicator */
+  mpiret = sc_MPI_Comm_size (submpicomm, &submpisize);
+  SC_CHECK_MPI (mpiret);
+  if (submpisize == p6est->mpisize) {
+    /* no reduction took place: there is no rank map to hand out, so give
+     * the caller a defined NULL exactly as in the empty-rank case above */
+    P4EST_ASSERT (ranks == NULL);
+    if (ranks_subcomm) {
+      *ranks_subcomm = NULL;
+    }
+    return 1;
+  }
+
+  /* set new parallel environment */
+  p6est_comm_parallel_env_release (p6est);
+  p6est_comm_parallel_env_assign (p6est, submpicomm);
+  if (p6est->columns->mpicomm_owned) {
+    /* move ownership of the sub-communicator from the columns to the p6est */
+    p6est->columns->mpicomm_owned = 0;
+    p6est->mpicomm_owned = 1;
+  }
+  P4EST_ASSERT (p6est->mpisize == submpisize);
+
+  /* per-rank layer counts of the old communicator, taken from the old
+   * offsets before that array is freed */
+  n_quadrants = P4EST_ALLOC (p4est_gloidx_t, mpisize);
+  for (i = 0; i < mpisize; i++) {
+    n_quadrants[i] = global_first_layer[i + 1] - global_first_layer[i];
+  }
+
+  /* allocate and set global layer count */
+  P4EST_FREE (p6est->global_first_layer);
+  p6est->global_first_layer = P4EST_ALLOC (p4est_gloidx_t, submpisize + 1);
+  p6est->global_first_layer[0] = 0;
+  for (i = 0; i < submpisize; i++) {
+    /* ranks[i] is used as an index into the old per-rank counts */
+    P4EST_ASSERT (ranks[i] != sc_MPI_UNDEFINED);
+    P4EST_ASSERT (group_add != sc_MPI_GROUP_NULL
+                  || 0 < n_quadrants[ranks[i]]);
+    p6est->global_first_layer[i + 1] =
+      p6est->global_first_layer[i] + n_quadrants[ranks[i]];
+  }
+  P4EST_FREE (n_quadrants);
+
+  /* hand the rank map to the caller if requested, otherwise discard it */
+  if (ranks_subcomm) {
+    *ranks_subcomm = ranks;
+  }
+  else {
+    P4EST_FREE (ranks);
+  }
+
+  /* return that p6est exists on this rank */
+  return 1;
+}
diff --git a/src/p6est_communication.h b/src/p6est_communication.h
new file mode 100644
index 0000000..be2fdee
--- /dev/null
+++ b/src/p6est_communication.h
@@ -0,0 +1,114 @@
+/*
+  This file is part of p4est.
+  p4est is a C library to manage a collection (a forest) of multiple
+  connected adaptive quadtrees or octrees in parallel.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
+
+  p4est is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  p4est is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with p4est; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+#ifndef P6EST_COMMUNICATION_H
+#define P6EST_COMMUNICATION_H
+
+/** \file p6est_communication.h
+ *
+ * MPI_Comm management.
+ *
+ * \ingroup p6est
+ */
+
+#include <p6est.h>
+
+SC_EXTERN_C_BEGIN;
+
+/** Assign an MPI communicator to p6est; retrieve parallel environment.
+ *
+ * \param [in] mpicomm    A valid MPI communicator.
+ *
+ * \note The provided MPI communicator is not owned by p6est.
+ */
+void                p6est_comm_parallel_env_assign (p6est_t * p6est,
+                                                    sc_MPI_Comm mpicomm);
+
+/** Duplicate MPI communicator and replace the current one by the duplicate.
+ *
+ * \note The duplicated MPI communicator is owned by p6est.
+ */
+void                p6est_comm_parallel_env_duplicate (p6est_t * p6est);
+
+/** Release MPI communicator if it is owned by p6est.
+ */
+void                p6est_comm_parallel_env_release (p6est_t * p6est);
+
+/** Replace the current MPI communicator by the one provided as input.
+ *
+ * \param [in] mpicomm    A valid MPI communicator.
+ *
+ * \note The provided MPI communicator is not owned by p6est.
+ */
+void                p6est_comm_parallel_env_replace (p6est_t * p6est,
+                                                     sc_MPI_Comm mpicomm);
+
+/** Retrieve parallel environment information.
+ */
+void                p6est_comm_parallel_env_get_info (p6est_t * p6est);
+
+/** Check if the MPI communicator is the NULL communicator.
+ *
+ * \return True if communicator is the NULL communicator, false otherwise.
+ */
+int                 p6est_comm_parallel_env_is_null (p6est_t * p6est);
+
+/** Reduce MPI communicator to non-empty ranks (i.e., nonzero quadrant counts).
+ *
+ * \param [in,out] p6est_supercomm  Object whose communicator is reduced.
+ *                                  Points to NULL if this p6est does not
+ *                                  exist.
+ *
+ * \return True if p6est exists on this MPI rank after reduction.
+ */
+int                 p6est_comm_parallel_env_reduce (p6est_t **
+                                                    p6est_supercomm);
+
+/** Reduce MPI communicator to non-empty ranks and add a group of ranks that
+ * will remain in the reduced communicator regardless of whether they are
+ * empty or not.
+ *
+ * \param [in,out] p6est_supercomm  Object whose communicator is reduced.
+ *                                  Points to NULL if this p6est does not
+ *                                  exist.
+ * \param [in] group_add         Group of ranks that will remain in
+ *                               communicator.
+ * \param [in] add_to_beginning  If true, ranks will be added to the beginning
+ *                               of the reduced communicator, otherwise to the
+ *                               end.
+ * \param[out] ranks_subcomm     If not null, array of size 'subcommsize' with
+ *                               subcommrank->supercommrank map.
+ *
+ * \return True if p6est exists on this MPI rank after reduction.
+ */
+int                 p6est_comm_parallel_env_reduce_ext (p6est_t **
+                                                        p6est_supercomm,
+                                                        sc_MPI_Group
+                                                        group_add,
+                                                        int add_to_beginning,
+                                                        int **ranks_subcomm);
+
+SC_EXTERN_C_END;
+
+#endif /* !P6EST_COMMUNICATION_H */
diff --git a/example/p6est/p6est_extended.h b/src/p6est_extended.h
similarity index 93%
rename from example/p6est/p6est_extended.h
rename to src/p6est_extended.h
index 57be005..96712c8 100644
--- a/example/p6est/p6est_extended.h
+++ b/src/p6est_extended.h
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2013 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -21,6 +22,9 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
 
+#ifndef P6EST_EXTENDED_H
+#define P6EST_EXTENDED_H
+
 /********************************************************************
  *                          IMPORTANT NOTE                          *
  *                                                                  *
@@ -36,11 +40,10 @@
  * \ingroup p6est
  */
 
-#ifndef P6EST_EXTENDED_H
-#define P6EST_EXTENDED_H
-
 #include <p6est.h>
 
+SC_EXTERN_C_BEGIN;
+
 /** Create a new forest.
  * This is a more general form of p6est_new().
  * See the documentation of p6est_new() for basic usage.
@@ -53,6 +56,8 @@
  * \param [in] min_zlevel       The forest is vertically refined at least to
  *                              this level.  May be negative or 0, then it has
  *                              no effect.
+ * \param [in] num_zroot        The number of "root" vertical layers
+ *                              (used when non-power-of-2 layers are desired).
  * \param [in] fill_uniform     If true, fill the forest with a uniform mesh
  *                              instead of the coarsest possible one.
  *                              The latter is partition-specific so that
@@ -62,9 +67,24 @@ p6est_t            *p6est_new_ext (sc_MPI_Comm mpicomm,
                                    p6est_connectivity_t * connectivity,
                                    p4est_locidx_t min_quadrants,
                                    int min_level, int min_zlevel,
+                                   int num_zroot,
                                    int fill_uniform, size_t data_size,
                                    p6est_init_t init_fn, void *user_pointer);
 
+/** Make a deep copy of a p6est.
+ * The connectivity is not duplicated.
+ * Copying of quadrant user data is optional.
+ * If old and new data sizes are 0, the user_data field is copied regardless.
+ * The inspect member of the copy is set to NULL.
+ *
+ * \param [in]  copy_data  If true, data are copied.
+ *                         If false, data_size is set to 0.
+ * \param [in]  duplicate_mpicomm  If true, MPI communicator is copied.
+ * \return  Returns a valid p6est that does not depend on the input.
+ */
+p6est_t            *p6est_copy_ext (p6est_t * input, int copy_data,
+                                    int duplicate_mpicomm);
+
 /** Save the complete connectivity/p6est data to disk.
  *
  * This is a collective operation that all MPI processes need to call.  All
@@ -263,4 +283,6 @@ void                p6est_balance_ext (p6est_t * p6est,
                                        p6est_init_t init_fn,
                                        p6est_replace_t replace_fn);
 
+SC_EXTERN_C_END;
+
 #endif
diff --git a/example/p6est/p6est_ghost.c b/src/p6est_ghost.c
similarity index 99%
rename from example/p6est/p6est_ghost.c
rename to src/p6est_ghost.c
index 02a5854..40c773b 100644
--- a/example/p6est/p6est_ghost.c
+++ b/src/p6est_ghost.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2014 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/p6est/p6est_ghost.h b/src/p6est_ghost.h
similarity index 98%
rename from example/p6est/p6est_ghost.h
rename to src/p6est_ghost.h
index 334a38b..b6142c3 100644
--- a/example/p6est/p6est_ghost.h
+++ b/src/p6est_ghost.h
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2014 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -21,6 +22,9 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
 
+#ifndef P6EST_GHOST_H
+#define P6EST_GHOST_H
+
 /** \file p6est_ghost.h
  *
  * passing columns of layers and data to neighboring processes
@@ -28,9 +32,6 @@
  * \ingroup p6est
  */
 
-#ifndef P6EST_GHOST_H
-#define P6EST_GHOST_H
-
 #include <p6est.h>
 #include <p4est_ghost.h>
 
diff --git a/example/p6est/p6est_lnodes.c b/src/p6est_lnodes.c
similarity index 83%
rename from example/p6est/p6est_lnodes.c
rename to src/p6est_lnodes.c
index 966b25e..58291df 100644
--- a/example/p6est/p6est_lnodes.c
+++ b/src/p6est_lnodes.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2014 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -397,3 +398,82 @@ p6est_lnodes_new (p6est_t * p6est, p6est_ghost_t * ghost, int degree)
 
   return lnodes;
 }
+
+/* Build, for every owned node, the global number of the node-column that
+ * contains it.
+ *
+ * Node-columns are numbered locally in the order in which they are first
+ * met while walking the local trees and their columns; an exclusive prefix
+ * sum over the ranks then turns the local numbers into global ones.
+ *
+ * Returns a newly allocated array with lnodes->owned_count entries.
+ */
+p4est_gloidx_t     *
+p6est_lnodes_get_column_labels (p6est_t * p6est, p8est_lnodes_t * lnodes)
+{
+  const int           nodes_per_line = lnodes->degree + 1;
+  const int           vnodes = lnodes->vnodes;
+  p4est_t            *columns = p6est->columns;
+  p4est_gloidx_t     *labels;
+  p4est_gloidx_t      local_count = 0;
+  p4est_gloidx_t      rank_offset = 0;
+  p4est_topidx_t      treeid;
+  p4est_locidx_t      lk;
+  size_t              zcol;
+  int                 line;
+  int                 mpiret;
+
+  /* all bytes set: every label starts out negative, meaning "unassigned" */
+  labels = P4EST_ALLOC (p4est_gloidx_t, lnodes->owned_count);
+  memset (labels, -1, lnodes->owned_count * sizeof (*labels));
+
+  for (treeid = columns->first_local_tree;
+       treeid <= columns->last_local_tree; ++treeid) {
+    p4est_tree_t       *tree = p4est_tree_array_index (columns->trees,
+                                                       treeid);
+    sc_array_t         *tquadrants = &tree->quadrants;
+
+    for (zcol = 0; zcol < tquadrants->elem_count; ++zcol) {
+      p4est_quadrant_t   *col = p4est_quadrant_array_index (tquadrants, zcol);
+      size_t              first, last;
+      p4est_locidx_t      bottom, top;
+
+      /* layer range of this column; 'last' is used as one past the end */
+      P6EST_COLUMN_GET_RANGE (col, &first, &last);
+      bottom = (p4est_locidx_t) first;
+      top = (p4est_locidx_t) last;
+
+      /* visit the first node of every group of (degree + 1) element nodes */
+      for (line = 0; line < vnodes; line += nodes_per_line) {
+        /* first node of the group in the bottom layer and last node of the
+         * same group in the top layer */
+        p4est_locidx_t      lowest =
+          lnodes->element_nodes[vnodes * bottom + line];
+        p4est_locidx_t      highest =
+          lnodes->element_nodes[vnodes * (top - 1) + line +
+                                (nodes_per_line - 1)];
+
+        P4EST_ASSERT (highest >= 0);
+        P4EST_ASSERT (highest >= lowest);
+        P4EST_ASSERT (lowest < lnodes->num_local_nodes);
+
+        if (highest >= lnodes->owned_count) {
+          /* the last node lies outside the owned range: nothing to label */
+          continue;
+        }
+        P4EST_ASSERT (lowest < lnodes->owned_count);
+        if (labels[lowest] >= 0) {
+          /* this node-column was labeled in an earlier iteration */
+          continue;
+        }
+
+        /* the node indices lowest..highest all receive the same number */
+        for (lk = lowest; lk <= highest; lk++) {
+          labels[lk] = local_count;
+        }
+        local_count++;
+      }
+    }
+  }
+
+  /* exclusive prefix sum of the per-rank node-column counts */
+  mpiret = sc_MPI_Exscan (&local_count, &rank_offset, 1, P4EST_MPI_GLOIDX,
+                          sc_MPI_SUM, lnodes->mpicomm);
+  SC_CHECK_MPI (mpiret);
+
+  /* the scan result is not relied upon on rank 0: force a zero offset */
+  if (p6est->mpirank == 0) {
+    rank_offset = 0;
+  }
+
+  /* shift the local numbers into the global numbering */
+  for (lk = 0; lk < lnodes->owned_count; lk++) {
+    labels[lk] += rank_offset;
+  }
+
+#if 0
+  {
+    sc_array_t          view;
+
+    sc_array_init_data (&view, labels, sizeof (*labels),
+                        (size_t) lnodes->num_local_nodes);
+
+    p6est_lnodes_share_owned (&view, lnodes);
+  }
+#endif
+
+  /* every owned node must have received a label */
+  for (lk = 0; lk < lnodes->owned_count; lk++) {
+    P4EST_ASSERT (labels[lk] >= 0);
+  }
+
+  return labels;
+}
diff --git a/example/p6est/p6est_lnodes.h b/src/p6est_lnodes.h
similarity index 94%
rename from example/p6est/p6est_lnodes.h
rename to src/p6est_lnodes.h
index cb2e7e7..ca24fc9 100644
--- a/example/p6est/p6est_lnodes.h
+++ b/src/p6est_lnodes.h
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2014 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -266,6 +267,18 @@ p6est_lnodes_global_index (p6est_lnodes_t * lnodes, p4est_locidx_t lidx)
   return p8est_lnodes_global_index (lnodes, lidx);
 }
 
+/** For each owned node, get the global 2D number for the node-column
+ * containing it.
+ *
+ * \param[in] p6est  The forest
+ * \param[in] lnodes The nodes
+ * \return an array of size \a lnodes->owned_count, giving the unique global
+ * number of the node-column containing each node.  Should be free'd with
+ * P4EST_FREE().
+ */
+p4est_gloidx_t     *p6est_lnodes_get_column_labels (p6est_t * p6est,
+                                                    p8est_lnodes_t * lnodes);
+
 SC_EXTERN_C_END;
 
 #endif /* !P6EST_LNODES */
diff --git a/example/p6est/p6est_profile.c b/src/p6est_profile.c
similarity index 97%
rename from example/p6est/p6est_profile.c
rename to src/p6est_profile.c
index dbcea7d..66268c9 100644
--- a/example/p6est/p6est_profile.c
+++ b/src/p6est_profile.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2014 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -182,13 +183,13 @@ p6est_profile_balance_self (sc_array_t * a, sc_array_t * work)
 }
 
 static void
-p6est_profile_balance_face_one_pass (sc_array_t * read, sc_array_t * write)
+p6est_profile_balance_face_one_pass (sc_array_t * read, sc_array_t * write,
+                                     p4est_qcoord_t readh)
 {
   int8_t             *wc;
   size_t              count;
   int                 stackcount;
   int8_t              n, nn, newn, p, l;
-  p4est_qcoord_t      readh;
   size_t              zy;
 
   P4EST_ASSERT (SC_ARRAY_IS_OWNER (write));
@@ -200,7 +201,6 @@ p6est_profile_balance_face_one_pass (sc_array_t * read, sc_array_t * write)
   sc_array_truncate (write);
   l = 0;
   zy = 0;
-  readh = 0;
   while (zy < count) {
     n = *((int8_t *) sc_array_index (read, count - 1 - zy++));
     if (n && !(readh & P4EST_QUADRANT_LEN (n))) {
@@ -225,7 +225,8 @@ p6est_profile_balance_face_one_pass (sc_array_t * read, sc_array_t * write)
 
 /* assumes a is already self balanced */
 static void
-p6est_profile_balance_face (sc_array_t * a, sc_array_t * b, sc_array_t * work)
+p6est_profile_balance_face (sc_array_t * a, sc_array_t * b, sc_array_t * work,
+                            p4est_qcoord_t diff)
 {
   P4EST_ASSERT (SC_ARRAY_IS_OWNER (b));
   P4EST_ASSERT (SC_ARRAY_IS_OWNER (work));
@@ -233,18 +234,18 @@ p6est_profile_balance_face (sc_array_t * a, sc_array_t * b, sc_array_t * work)
   P4EST_ASSERT (b->elem_size == sizeof (int8_t));
   P4EST_ASSERT (work->elem_size == sizeof (int8_t));
 
-  p6est_profile_balance_face_one_pass (a, work);
+  p6est_profile_balance_face_one_pass (a, work, diff);
   p6est_profile_balance_self_one_pass (work, b);
 }
 
 static void
-p6est_profile_balance_full_one_pass (sc_array_t * read, sc_array_t * write)
+p6est_profile_balance_full_one_pass (sc_array_t * read, sc_array_t * write,
+                                     p4est_qcoord_t readh)
 {
   int8_t             *wc;
   size_t              count;
   int                 stackcount;
   int8_t              n, nn, newn, p, l, prevl, nextl;
-  p4est_qcoord_t      readh;
   size_t              zy;
 
   P4EST_ASSERT (SC_ARRAY_IS_OWNER (write));
@@ -256,7 +257,6 @@ p6est_profile_balance_full_one_pass (sc_array_t * read, sc_array_t * write)
   sc_array_truncate (write);
   l = 0;
   zy = 0;
-  readh = 0;
   while (zy < count) {
     n = *((int8_t *) sc_array_index (read, count - 1 - zy++));
     if (n && !(readh & P4EST_QUADRANT_LEN (n))) {
@@ -295,7 +295,8 @@ p6est_profile_balance_full_one_pass (sc_array_t * read, sc_array_t * write)
 
 /* assumes a is already self balanced */
 static void
-p6est_profile_balance_full (sc_array_t * a, sc_array_t * b, sc_array_t * work)
+p6est_profile_balance_full (sc_array_t * a, sc_array_t * b, sc_array_t * work,
+                            p4est_qcoord_t diff)
 {
   P4EST_ASSERT (SC_ARRAY_IS_OWNER (b));
   P4EST_ASSERT (SC_ARRAY_IS_OWNER (work));
@@ -303,7 +304,7 @@ p6est_profile_balance_full (sc_array_t * a, sc_array_t * b, sc_array_t * work)
   P4EST_ASSERT (b->elem_size == sizeof (int8_t));
   P4EST_ASSERT (work->elem_size == sizeof (int8_t));
 
-  p6est_profile_balance_full_one_pass (a, work);
+  p6est_profile_balance_full_one_pass (a, work, diff);
   p6est_profile_balance_self_one_pass (work, b);
 }
 
@@ -533,6 +534,7 @@ p6est_profile_new_local (p6est_t * p6est,
   p4est_tree_t       *tree;
   sc_array_t         *tquadrants;
   p4est_quadrant_t   *col;
+  p4est_qcoord_t      diff = P4EST_ROOT_LEN - p6est->root_len;
   size_t              first, last, count, zz, zy;
   p4est_locidx_t     *en, (*lr)[2];
   sc_array_t         *lc;
@@ -556,6 +558,7 @@ p6est_profile_new_local (p6est_t * p6est,
   profile->lnode_changed[0] = NULL;
   profile->lnode_changed[1] = NULL;
   profile->enode_counts = NULL;
+  profile->diff = diff;
   if (btype == P8EST_CONNECT_FACE) {
     hbtype = P4EST_CONNECT_FACE;
   }
@@ -614,16 +617,16 @@ p6est_profile_new_local (p6est_t * p6est,
       if (ptype == P6EST_PROFILE_UNION) {
         p6est_profile_balance_self (selfprof, work);
         if (btype == P8EST_CONNECT_FACE) {
-          p6est_profile_balance_face (selfprof, faceprof, work);
+          p6est_profile_balance_face (selfprof, faceprof, work, diff);
         }
         else {
-          p6est_profile_balance_full (selfprof, faceprof, work);
+          p6est_profile_balance_full (selfprof, faceprof, work, diff);
         }
         if (btype == P8EST_CONNECT_EDGE) {
-          p6est_profile_balance_face (selfprof, cornerprof, work);
+          p6est_profile_balance_face (selfprof, cornerprof, work, diff);
         }
         else if (btype == P8EST_CONNECT_FULL) {
-          p6est_profile_balance_full (selfprof, cornerprof, work);
+          p6est_profile_balance_full (selfprof, cornerprof, work, diff);
         }
       }
       for (j = 0; j < Nrp; j++) {
@@ -728,6 +731,7 @@ p6est_profile_balance_local (p6est_profile_t * profile)
   int                 any_prof_change;
   int                 any_local_change;
   int                 evenodd = profile->evenodd;
+  p4est_qcoord_t      diff = profile->diff;
 
   P4EST_ASSERT (profile->lnodes->degree == 2);
 
@@ -803,16 +807,16 @@ p6est_profile_balance_local (p6est_profile_t * profile)
         P4EST_ASSERT (selfprof->elem_count > oldprof.elem_count);
         /* update */
         if (btype == P8EST_CONNECT_FACE) {
-          p6est_profile_balance_face (selfprof, faceprof, work);
+          p6est_profile_balance_face (selfprof, faceprof, work, diff);
         }
         else {
-          p6est_profile_balance_full (selfprof, faceprof, work);
+          p6est_profile_balance_full (selfprof, faceprof, work, diff);
         }
         if (btype == P8EST_CONNECT_EDGE) {
-          p6est_profile_balance_face (selfprof, cornerprof, work);
+          p6est_profile_balance_face (selfprof, cornerprof, work, diff);
         }
         else if (btype == P8EST_CONNECT_FULL) {
-          p6est_profile_balance_full (selfprof, cornerprof, work);
+          p6est_profile_balance_full (selfprof, cornerprof, work, diff);
         }
         enidx = start_enidx;
         for (j = 0; j < 3; j++) {
diff --git a/example/p6est/p6est_profile.h b/src/p6est_profile.h
similarity index 95%
rename from example/p6est/p6est_profile.h
rename to src/p6est_profile.h
index 88469ec..eb7314e 100644
--- a/example/p6est/p6est_profile.h
+++ b/src/p6est_profile.h
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2014 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -29,6 +30,8 @@
 #include <p4est_lnodes.h>
 #include <p6est_lnodes.h>
 
+SC_EXTERN_C_BEGIN;
+
 /** A p6est profile is used to (a) balance a p6est, and (b) generate a
  * p6est_lnodes.  In every case, layers in a column are compressed to one
  * int8_t each.  The resulting column profiles and be quickly intersected,
@@ -56,6 +59,7 @@ typedef struct p6est_profile
   int                *lnode_changed[2];
   p4est_locidx_t     *enode_counts;
   int                 evenodd;
+  p4est_qcoord_t      diff;
 }
 p6est_profile_t;
 
@@ -102,4 +106,7 @@ void                p6est_profile_element_to_node (p6est_t * p6est,
                                                    p4est_locidx_t *
                                                    elem_to_node,
                                                    p6est_lnodes_code_t * fc);
+
+SC_EXTERN_C_END;
+
 #endif /* !P6EST_PROFILE_H */
diff --git a/example/p6est/p6est_vtk.c b/src/p6est_vtk.c
similarity index 99%
rename from example/p6est/p6est_vtk.c
rename to src/p6est_vtk.c
index 86d0f1b..4fc3b29 100644
--- a/example/p6est/p6est_vtk.c
+++ b/src/p6est_vtk.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2013 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/example/p6est/p6est_vtk.h b/src/p6est_vtk.h
similarity index 98%
rename from example/p6est/p6est_vtk.h
rename to src/p6est_vtk.h
index 234759a..234d617 100644
--- a/example/p6est/p6est_vtk.h
+++ b/src/p6est_vtk.h
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2013 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est.c b/src/p8est.c
index c90fb08..6935c3d 100644
--- a/src/p8est.c
+++ b/src/p8est.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est.h b/src/p8est.h
index ee30860..0dd145f 100644
--- a/src/p8est.h
+++ b/src/p8est.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -57,12 +58,12 @@ SC_EXTERN_C_BEGIN;
  */
 #define P8EST_LAST_OFFSET(l) (P8EST_ROOT_LEN - P8EST_QUADRANT_LEN (l))
 
-/** The 3D quadrant (i.e. octant) datatype */
+/** The 3D quadrant (i.e., octant) datatype */
 typedef struct p8est_quadrant
 {
-  /*@{*/
+  /*@{ */
   p4est_qcoord_t      x, y, z;  /**< coordinates */
-  /*@}*/
+  /*@} */
   int8_t              level,    /**< level of refinement */
                       pad8;     /**< padding */
   int16_t             pad16;    /**< padding */
@@ -127,11 +128,13 @@ typedef struct p8est
   sc_MPI_Comm         mpicomm;          /**< MPI communicator */
   int                 mpisize,          /**< number of MPI processes */
                       mpirank;          /**< this process's MPI rank */
+  int                 mpicomm_owned;    /**< flag if communicator is owned */
   size_t              data_size;        /**< size of per-quadrant p.user_data
                      (see p8est_quadrant_t::p8est_quadrant_data::user_data) */
   void               *user_pointer;     /**< convenience pointer for users,
                                              never touched by p4est */
 
+  long                revision;         /**< Gets bumped on mesh change */
   p4est_topidx_t      first_local_tree; /**< 0-based index of first local
                                              tree, must be -1 for an empty
                                              processor */
@@ -152,22 +155,33 @@ typedef struct p8est
   sc_array_t         *trees;          /**< array of all trees */
 
   sc_mempool_t       *user_data_pool; /**< memory allocator for user data */
-                                      /*   WARNING: This is NULL if data size
-                                                    equals zero. */
+  /*   WARNING: This is NULL if data size
+     equals zero. */
   sc_mempool_t       *quadrant_pool;  /**< memory allocator for temporary
                                            quadrants */
   p8est_inspect_t    *inspect;        /**< algorithmic switches */
 }
 p8est_t;
 
-/** Calculate memory usage of a forest structure.
+/** Calculate local memory usage of a forest structure.
+ * Not collective.  The memory used on the current rank is returned.
  * The connectivity structure is not counted since it is not owned;
  * use p8est_connectivity_memory_usage (p8est->connectivity).
- * \param [in] p8est    Forest structure.
+ * \param [in] p8est    Valid forest structure.
  * \return              Memory used in bytes.
  */
 size_t              p8est_memory_used (p8est_t * p8est);
 
+/** Return the revision counter of the forest.
+ * Not collective, even though the revision value is the same on all ranks.
+ * A newly created forest starts with a revision counter of zero.
+ * Every refine, coarsen, partition, and balance that actually changes the mesh
+ * increases the counter by one.  Operations with no effect keep the old value.
+ * \param [in] p8est    The forest must be valid.
+ * \return              Non-negative number.
+ */
+long                p8est_revision (p8est_t * p8est);
+
 /** Callback function prototype to initialize the quadrant's user data.
  * \param [in] p8est         the forest
  * \param [in] which_tree    the tree containing \a quadrant
@@ -266,10 +280,13 @@ void                p8est_destroy (p8est_t * p8est);
  * Copying of quadrant user data is optional.
  * If old and new data sizes are 0, the user_data field is copied regardless.
  * The inspect member of the copy is set to NULL.
+ * The revision counter of the copy is set to zero.
  *
  * \param [in]  copy_data  If true, data are copied.
  *                         If false, data_size is set to 0.
- * \return  Returns a valid p8est that does not depend on the input.
+ * \return  Returns a valid p8est that does not depend on the input,
+ *                         except for borrowing the same connectivity.
+ *                         Its revision counter is 0.
  */
 p8est_t            *p8est_copy (p8est_t * input, int copy_data);
 
@@ -371,6 +388,9 @@ unsigned            p8est_checksum (p8est_t * p8est);
  * header.  This makes the file depend on mpisize.  For changing this see
  * p8est_save_ext() in p8est_extended.h.
  *
+ * The revision counter is not saved to the file, since that would make files
+ * different that come from different revisions but store the same mesh.
+ *
  * \param [in] filename    Name of the file to write.
  * \param [in] p8est       Valid forest structure.
  * \param [in] save_data   If true, the element data is saved.
@@ -394,6 +414,8 @@ void                p8est_save (const char *filename, p8est_t * p8est,
  * that it was stored with.  The defaults can be changed with p8est_load_ext()
  * in p8est_extended.h.
  *
+ * The revision counter of the loaded p4est is set to zero.
+ *
  * \param [in] filename         Name of the file to read.
  * \param [in] mpicomm          A valid MPI communicator.
  * \param [in] data_size        Size of data for each quadrant which can be
diff --git a/src/p8est_algorithms.c b/src/p8est_algorithms.c
index 5d38a53..aec9b13 100644
--- a/src/p8est_algorithms.c
+++ b/src/p8est_algorithms.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_algorithms.h b/src/p8est_algorithms.h
index 80c50d1..ae1444d 100644
--- a/src/p8est_algorithms.h
+++ b/src/p8est_algorithms.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -111,7 +112,10 @@ int                 p8est_is_equal (p8est_t * p8est1, p8est_t * p8est2,
  *    the quadrant counters are consistent
  *    all trees are complete
  *    all non-local trees are empty
+ * This function is collective!
+ * It is also relatively expensive, so its use in production should be limited.
  * \param [in] p8est    The forest to be tested.
+ *                      Itself and its connectivity must be non-NULL.
  * \return              Returns true if valid, false otherwise.
  */
 int                 p8est_is_valid (p8est_t * p8est);
@@ -186,6 +190,9 @@ void                p8est_complete_region (p8est_t * p8est,
 
 /** Completes a sorted tree within a p8est. It may have exterior quadrants.
  * The completed tree will have only owned quadrants and no overlap.
+ * Note that the tree's counters (\a quadrants_per_level, \a maxlevel) must be
+ * correct for the quadrants in the incoming tree.
+ *
  * \param [in,out] p8est      The p8est to work on.
  * \param [in]     which_tree The 0-based index of the subtree to complete.
  * \param [in]     init_fn    Callback function to initialize the user_data
diff --git a/src/p8est_balance.c b/src/p8est_balance.c
index a789bc4..1e057be 100644
--- a/src/p8est_balance.c
+++ b/src/p8est_balance.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2011 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_balance.h b/src/p8est_balance.h
index 566de71..43bd570 100644
--- a/src/p8est_balance.h
+++ b/src/p8est_balance.h
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2011 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_bits.c b/src/p8est_bits.c
index 0ccc9d6..f277b29 100644
--- a/src/p8est_bits.c
+++ b/src/p8est_bits.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_bits.h b/src/p8est_bits.h
index 49f6088..d9f03ca 100644
--- a/src/p8est_bits.h
+++ b/src/p8est_bits.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -234,6 +235,15 @@ int                 p8est_quadrant_is_extended (const p8est_quadrant_t * q);
 int                 p8est_quadrant_is_sibling (const p8est_quadrant_t * q1,
                                                const p8est_quadrant_t * q2);
 
+/** Compute a specific child of a quadrant.
+ * \param [in]     q    Input quadrant.
+ * \param [in,out] r    Existing quadrant whose Morton index will be filled
+ *                      with the coordinates of its child no. \b child_id.
+ * \param [in] child_id The id of the child computed, 0..7.
+ */
+void                p8est_quadrant_child (const p8est_quadrant_t * q,
+                                          p8est_quadrant_t * r, int child_id);
+
 /** Test if two quadrants are siblings.
  * Descriptive, slower version of \a p8est_quadrant_is_sibling.
  * For debugging and educational purposes only.
diff --git a/src/p8est_communication.c b/src/p8est_communication.c
index 7522aa1..5a1bdcc 100644
--- a/src/p8est_communication.c
+++ b/src/p8est_communication.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_communication.h b/src/p8est_communication.h
index 6905d46..4cd0efa 100644
--- a/src/p8est_communication.h
+++ b/src/p8est_communication.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -28,34 +29,78 @@
 
 SC_EXTERN_C_BEGIN;
 
-typedef enum
-{
-  P8EST_COMM_COUNT_PERTREE = 1,
-  P8EST_COMM_BALANCE_FIRST_COUNT,
-  P8EST_COMM_BALANCE_FIRST_LOAD,
-  P8EST_COMM_BALANCE_SECOND_COUNT,
-  P8EST_COMM_BALANCE_SECOND_LOAD,
-  P8EST_COMM_PARTITION_GIVEN,
-  P8EST_COMM_PARTITION_WEIGHTED_LOW,
-  P8EST_COMM_PARTITION_WEIGHTED_HIGH,
-  P8EST_COMM_PARTITION_CORRECTION,
-  P8EST_COMM_GHOST_COUNT,
-  P8EST_COMM_GHOST_LOAD,
-  P8EST_COMM_GHOST_EXCHANGE,
-  P8EST_COMM_GHOST_EXPAND_COUNT,
-  P8EST_COMM_GHOST_EXPAND_LOAD,
-  P8EST_COMM_GHOST_SUPPORT_COUNT,
-  P8EST_COMM_GHOST_SUPPORT_LOAD,
-  P8EST_COMM_GHOST_CHECKSUM,
-  P8EST_COMM_NODES_QUERY,
-  P8EST_COMM_NODES_REPLY,
-  P8EST_COMM_SAVE,
-  P8EST_COMM_LNODES_TEST,
-  P8EST_COMM_LNODES_PASS,
-  P8EST_COMM_LNODES_OWNED,
-  P8EST_COMM_LNODES_ALL
-}
-p8est_comm_tag_t;
+/** Assign an MPI communicator to p8est; retrieve parallel environment.
+ *
+ * \param [in] mpicomm    A valid MPI communicator.
+ *
+ * \note The provided MPI communicator is not owned by p8est.
+ */
+void                p8est_comm_parallel_env_assign (p8est_t * p8est,
+                                                    sc_MPI_Comm mpicomm);
+
+/** Duplicate MPI communicator and replace the current one by the duplicate.
+ *
+ * \note The duplicated MPI communicator is owned by p8est.
+ */
+void                p8est_comm_parallel_env_duplicate (p8est_t * p8est);
+
+/** Release MPI communicator if it is owned by p8est.
+ */
+void                p8est_comm_parallel_env_release (p8est_t * p8est);
+
+/** Replace the current MPI communicator by the one provided as input.
+ *
+ * \param [in] mpicomm    A valid MPI communicator.
+ *
+ * \note The provided MPI communicator is not owned by p8est.
+ */
+void                p8est_comm_parallel_env_replace (p8est_t * p8est,
+                                                     sc_MPI_Comm mpicomm);
+
+/** Retrieve parallel environment information.
+ */
+void                p8est_comm_parallel_env_get_info (p8est_t * p8est);
+
+/** Check if the MPI communicator is valid.
+ *
+ * \return True if communicator is not NULL communicator, false otherwise.
+ */
+int                 p8est_comm_parallel_env_is_null (p8est_t * p8est);
+
+/** Reduce MPI communicator to non-empty ranks (i.e., nonzero quadrant counts).
+ *
+ * \param [in,out] p8est_supercomm  Object which communicator is reduced.
+ *                                  Points to NULL if this p8est does not
+ *                                  exist.
+ *
+ * \return True if p8est exists on this MPI rank after reduction.
+ */
+int                 p8est_comm_parallel_env_reduce (p8est_t **
+                                                    p8est_supercomm);
+
+/** Reduce MPI communicator to non-empty ranks and add a group of ranks that
+ * will remain in the reduced communicator regardless whether they are empty
+ * or not.
+ *
+ * \param [in,out] p8est_supercomm  Object which communicator is reduced.
+ *                                  Points to NULL if this p8est does not
+ *                                  exist.
+ * \param [in] group_add         Group of ranks that will remain in
+ *                               communicator.
+ * \param [in] add_to_beginning  If true, ranks will be added to the beginning
+ *                               of the reduced communicator, otherwise to the
+ *                               end.
+ * \param[out] ranks_subcomm     If not null, array of size 'subcommsize' with
+ *                               subcommrank->supercommrank map.
+ *
+ * \return True if p8est exists on this MPI rank after reduction.
+ */
+int                 p8est_comm_parallel_env_reduce_ext (p8est_t **
+                                                        p8est_supercomm,
+                                                        sc_MPI_Group
+                                                        group_add,
+                                                        int add_to_beginning,
+                                                        int **ranks_subcomm);
 
 /** Caculate the number and partition of quadrents.
  * \param [in,out] p8est  Adds all \c p8est->local_num_quadrant counters and
@@ -84,6 +129,13 @@ void                p8est_comm_global_partition (p8est_t * p8est,
 void                p8est_comm_count_pertree (p8est_t * p8est,
                                               p4est_gloidx_t * pertree);
 
+/** Query whether a processor has no quadrants.
+ * \param [in] p8est    This forest's global_first_position array must be valid.
+ * \param [in] p        Valid processor id.
+ * \return              True if and only if processor \a p is empty.
+ */
+int                 p8est_comm_is_empty (p8est_t * p8est, int p);
+
 /** Tests ownershop of a quadrant via p8est->global_first_position.
  * Assumes a tree with no overlaps.
  * \param [in] rank    Rank whose ownership is tested.
diff --git a/src/p8est_connectivity.c b/src/p8est_connectivity.c
index edb9342..c09bf3b 100644
--- a/src/p8est_connectivity.c
+++ b/src/p8est_connectivity.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -62,6 +63,20 @@ const int           p8est_face_permutation_refs[6][6] =
  { 0, 2, 2, 0, 0, 1 },
  { 2, 0, 0, 2, 2, 0 }};
 
+const int           p8est_face_edge_permutations[8][4] =
+{{ 0, 1, 2, 3 },
+ { 0, 1, 3, 2 },
+ { 1, 0, 2, 3 },
+ { 1, 0, 3, 2 },
+ { 2, 3, 0, 1 },
+ { 2, 3, 1, 0 },
+ { 3, 2, 0, 1 },
+ { 3, 2, 1, 0 }};
+const int           p8est_face_edge_permutation_sets[3][4] =
+{{ 4, 1, 2, 7 },
+ { 0, 6, 5, 3 },
+ { 0, 5, 6, 3 }};
+
 const int           p8est_edge_faces[12][2] =
 {{ 2, 4 },
  { 3, 4 },
@@ -88,6 +103,19 @@ const int           p8est_edge_corners[12][2] =
  { 1, 5 },
  { 2, 6 },
  { 3, 7 }};
+const int           p8est_edge_edge_corners[12][8] =
+{{  0,  1, -1, -1, -1, -1, -1, -1},
+ { -1, -1,  0,  1, -1, -1, -1, -1},
+ { -1, -1, -1, -1,  0,  1, -1, -1},
+ { -1, -1, -1, -1, -1, -1,  0,  1},
+ {  0, -1,  1, -1, -1, -1, -1, -1},
+ { -1,  0, -1,  1, -1, -1, -1, -1},
+ { -1, -1, -1, -1,  0, -1,  1, -1},
+ { -1, -1, -1, -1, -1,  0, -1,  1},
+ {  0, -1, -1, -1,  1, -1, -1, -1},
+ { -1,  0, -1, -1, -1,  1, -1, -1},
+ { -1, -1,  0, -1, -1, -1,  1, -1},
+ { -1, -1, -1,  0, -1, -1, -1,  1}};
 const int           p8est_edge_face_corners[12][6][2] =
 {{{ -1, -1 }, { -1, -1 }, {  0,  1 }, { -1, -1 }, {  0,  1 }, { -1, -1 }},
  {{ -1, -1 }, { -1, -1 }, { -1, -1 }, {  0,  1 }, {  2,  3 }, { -1, -1 }},
@@ -101,6 +129,24 @@ const int           p8est_edge_face_corners[12][6][2] =
  {{ -1, -1 }, {  0,  2 }, {  1,  3 }, { -1, -1 }, { -1, -1 }, { -1, -1 }},
  {{  1,  3 }, { -1, -1 }, { -1, -1 }, {  0,  2 }, { -1, -1 }, { -1, -1 }},
  {{ -1, -1 }, {  1,  3 }, { -1, -1 }, {  1,  3 }, { -1, -1 }, { -1, -1 }}};
+const int           p8est_edge_face_edges[12][6] =
+{{ -1, -1,  0, -1,  0, -1 },
+ { -1, -1, -1,  0,  1, -1 },
+ { -1, -1,  1, -1, -1,  0 },
+ { -1, -1, -1,  1, -1,  1 },
+ {  0, -1, -1, -1,  2, -1 },
+ { -1,  0, -1, -1,  3, -1 },
+ {  1, -1, -1, -1, -1,  2 },
+ { -1,  1, -1, -1, -1,  3 },
+ {  2, -1,  2, -1, -1, -1 },
+ { -1,  2,  3, -1, -1, -1 },
+ {  3, -1, -1,  2, -1, -1 },
+ { -1,  3, -1,  3, -1, -1 }};
+const int           p8est_edge_corner_permutation_sets[2] =
+{0, 1};
+const int           p8est_edge_corner_permuations[2][2] =
+{{0, 1},
+ {1, 0}};
 
 const int           p8est_corner_faces[8][3] =
 {{ 0, 2, 4 },
@@ -129,7 +175,15 @@ const int           p8est_corner_face_corners[8][6] =
  { -1,  2,  3, -1, -1,  1 },
  {  3, -1, -1,  2, -1,  2 },
  { -1,  3, -1,  3, -1,  3 }};
-
+const int           p8est_corner_edge_corners[8][12] =
+{{  0, -1, -1, -1,  0, -1, -1, -1,  0, -1, -1, -1 },
+ {  1, -1, -1, -1, -1,  0, -1, -1, -1,  0, -1, -1 },
+ { -1,  0, -1, -1,  1, -1, -1, -1, -1, -1,  0, -1 },
+ { -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  0 },
+ { -1, -1,  0, -1, -1, -1,  0, -1,  1, -1, -1, -1 },
+ { -1, -1,  1, -1, -1, -1, -1,  0, -1,  1, -1, -1 },
+ { -1, -1, -1,  0, -1, -1,  1, -1, -1, -1,  1, -1 },
+ { -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1 }};
 const int           p8est_child_edge_faces[8][12] =
 {{ -1,  4,  2, -1, -1,  4,  0, -1, -1,  2,  0, -1 },
  { -1,  4,  2, -1,  4, -1, -1,  1,  2, -1, -1,  1 },
@@ -874,47 +928,62 @@ p8est_find_edge_transform_internal (p4est_connectivity_t * conn,
                                     p8est_edge_info_t * ei,
                                     const p4est_topidx_t * ett,
                                     const int8_t * ete,
-                                    p4est_topidx_t edge_trees,
-                                    p4est_topidx_t ntrees[2])
+                                    p4est_topidx_t edge_trees)
 {
-  int                 i;
+  int                 i, j;
   int                 redge, nedge, iflip, nflip;
-  int                 pref, pset, fc[2];
-  int                 faces[2], nfaces[2], orients[2];
-#ifdef P4EST_ENABLE_DEBUG
-  int                 founds[2];
-#endif
-  int                 nows[2];
+  int                 pref, pset, fc[2], nc[2];
+  int                 face, nface, orient, eorient;
   p4est_topidx_t      etree, ietree, ntree;
   p8est_edge_transform_t *et;
   sc_array_t         *ta = &ei->edge_transforms;
-  const int           noncorners[2] = { -1, -1 };
-  const int          *fcorners[2] = { noncorners, noncorners };
-  const int          *nfcorners;
-  int                 flipped = 0;
+  const int          *fcorners;
+  int                 distinct = 1;
+  int                 edges[3], edgeorients[3];
+  p4est_topidx_t      etrees[3];
 
   P4EST_ASSERT (0 <= itree && itree < conn->num_trees);
   P4EST_ASSERT (0 <= iedge && iedge < P8EST_EDGES);
   P4EST_ASSERT (ta->elem_size == sizeof (p8est_edge_transform_t));
 
+  etrees[0] = itree;
+  edges[0] = iedge;
+  edgeorients[0] = 0;
+
   /* identify touching faces */
   for (i = 0; i < 2; ++i) {
-    faces[i] = p8est_edge_faces[iedge][i];
-    ntrees[i] = conn->tree_to_tree[P4EST_FACES * itree + faces[i]];
-    nfaces[i] = (int) conn->tree_to_face[P4EST_FACES * itree + faces[i]];
-    if (ntrees[i] == itree && nfaces[i] == faces[i]) {  /* domain boundary */
-      ntrees[i] = -1;
-      nfaces[i] = orients[i] = -1;
-    }
-    else {
-      orients[i] = nfaces[i] / P4EST_FACES;
-      nfaces[i] %= P4EST_FACES;
-      fcorners[i] = p8est_edge_face_corners[iedge][faces[i]];
-      P4EST_ASSERT (fcorners[i][0] >= 0 && fcorners[i][1] >= 0);
+    face = p8est_edge_faces[iedge][i];
+    ntree = conn->tree_to_tree[P4EST_FACES * itree + face];
+    nface = (int) conn->tree_to_face[P4EST_FACES * itree + face];
+    if (ntree != itree || nface != face) {      /* not domain boundary */
+      orient = nface / P4EST_FACES;
+      nface %= P4EST_FACES;
+      fcorners = &(p8est_edge_face_corners[iedge][face][0]);
+      P4EST_ASSERT (fcorners[0] >= 0 && fcorners[1] >= 0);
+      pref = p8est_face_permutation_refs[face][nface];
+      pset = p8est_face_permutation_sets[pref][orient];
+      fc[0] = p8est_face_permutations[pset][fcorners[0]];
+      fc[1] = p8est_face_permutations[pset][fcorners[1]];
+
+      /* if this is a new edge, add it */
+      nc[0] = p8est_face_corners[nface][fc[0]];
+      nc[1] = p8est_face_corners[nface][fc[1]];
+      nedge = p8est_child_corner_edges[nc[0]][nc[1]];
+      P4EST_ASSERT (nedge >= 0);
+      eorient = (p8est_edge_corners[nedge][1] == nc[0]);
+      for (j = 0; j < distinct; j++) {
+        if (ntree == etrees[j] &&
+            nedge == edges[j] && eorient == edgeorients[j]) {
+          break;
+        }
+      }
+      if (j == distinct) {
+        etrees[j] = ntree;
+        edges[j] = nedge;
+        edgeorients[j] = eorient;
+        distinct++;
+      }
     }
-#ifdef P4EST_ENABLE_DEBUG
-    founds[i] = 0;
-#endif
   }
 
   /* find orientation of this edge */
@@ -946,6 +1015,16 @@ p8est_find_edge_transform_internal (p4est_connectivity_t * conn,
     nedge = redge % P8EST_EDGES;
     nflip = (redge / P8EST_EDGES) ^ iflip;
 
+    for (j = 0; j < distinct; j++) {
+      if (ntree == etrees[j] && nedge == edges[j] && nflip == edgeorients[j]) {
+        break;
+      }
+    }
+    if (j < distinct) {
+      /* already found from self or faces */
+      continue;
+    }
+#if 0
     nows[0] = nows[1] = 0;
     for (i = 0; i < 2; ++i) {
       if (ntree == ntrees[i]) {
@@ -974,6 +1053,7 @@ p8est_find_edge_transform_internal (p4est_connectivity_t * conn,
     if (nows[0] || nows[1]) {
       continue;
     }
+#endif
 
     /* else we have a diagonal edge with ntree */
     et = (p8est_edge_transform_t *) sc_array_push (ta);
@@ -986,18 +1066,63 @@ p8est_find_edge_transform_internal (p4est_connectivity_t * conn,
     et->corners = (int8_t) (nedge % 4);
   }
 
-  return flipped;
+  return distinct;
 }
 
 #include "p4est_connectivity.c"
 
+int
+p8est_connectivity_face_neighbor_edge_orientation (int e, int f,
+                                                   int nf, int o)
+{
+  int                 fe, nfe, pref, pset;
+
+  P4EST_ASSERT (0 <= e && e < P8EST_EDGES);
+  P4EST_ASSERT (0 <= f && f < P4EST_FACES);
+  P4EST_ASSERT (0 <= nf && nf < P4EST_FACES);
+  P4EST_ASSERT (0 <= o && o < P4EST_HALF);
+
+  fe = p8est_edge_face_edges[e][f];
+  P4EST_ASSERT (0 <= fe && fe < P4EST_HALF);
+
+  pref = p8est_face_permutation_refs[f][nf];
+  pset = p8est_face_edge_permutation_sets[pref][o];
+  nfe = p8est_face_edge_permutations[pset][fe];
+
+  P4EST_ASSERT (0 <= nfe && nfe < P4EST_HALF);
+
+  return p8est_face_edges[nf][nfe];
+}
+
+/* Map corner c on edge e to the corner of edge ne; o == 1 swaps endpoints. */
+int
+p8est_connectivity_edge_neighbor_corner_orientation (int c, int e,
+                                                     int ne, int o)
+{
+  int                 ec, nec, pset;
+
+  P4EST_ASSERT (0 <= c && c < P4EST_CHILDREN);
+  P4EST_ASSERT (0 <= e && e < P8EST_EDGES);
+  P4EST_ASSERT (0 <= ne && ne < P8EST_EDGES);
+  P4EST_ASSERT (0 <= o && o < 2);
+
+  /* position (0 or 1) of corner c along edge e; -1 if c is not on e */
+  ec = p8est_edge_edge_corners[e][c];
+  P4EST_ASSERT (0 <= ec && ec < 2);
+  /* o == 0 keeps the position, o == 1 exchanges the two edge endpoints */
+  pset = p8est_edge_corner_permutation_sets[o];
+  nec = p8est_edge_corner_permuations[pset][ec];
+  P4EST_ASSERT (0 <= nec && nec < 2);
+  return p8est_edge_corners[ne][nec];
+}
+
 void
 p8est_find_edge_transform (p4est_connectivity_t * conn,
                            p4est_topidx_t itree, int iedge,
                            p8est_edge_info_t * ei)
 {
-  int                 flipped;
-  p4est_topidx_t      ntrees[2], edge_trees, aedge, ettae;
+  int                 distinct;
+  p4est_topidx_t      edge_trees, aedge, ettae;
   sc_array_t         *ta = &ei->edge_transforms;
 
   P4EST_ASSERT (0 <= itree && itree < conn->num_trees);
@@ -1022,12 +1147,11 @@ p8est_find_edge_transform (p4est_connectivity_t * conn,
   P4EST_ASSERT (0 <= ettae && 1 <= edge_trees);
 
   /* loop through all edge neighbors and find edge connections */
-  flipped = p8est_find_edge_transform_internal (conn, itree, iedge, ei,
-                                                conn->edge_to_tree + ettae,
-                                                conn->edge_to_edge + ettae,
-                                                edge_trees, ntrees);
-  P4EST_ASSERT (edge_trees == (p4est_topidx_t) ta->elem_count
-                + 1 + (ntrees[0] != -1) + (ntrees[1] != -1) - flipped);
+  distinct = p8est_find_edge_transform_internal (conn, itree, iedge, ei,
+                                                 conn->edge_to_tree + ettae,
+                                                 conn->edge_to_edge + ettae,
+                                                 edge_trees);
+  P4EST_ASSERT (edge_trees == (p4est_topidx_t) ta->elem_count + distinct);
 }
 
 int
diff --git a/src/p8est_connectivity.h b/src/p8est_connectivity.h
index f258f1d..0506af8 100644
--- a/src/p8est_connectivity.h
+++ b/src/p8est_connectivity.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -134,23 +135,27 @@ const char         *p8est_connect_type_string (p8est_connect_type_t btype);
  * [0][0]..[0][2]..[num_vertices-1][0]..[num_vertices-1][2].
  *
  * The edges are only stored when they connect trees.
+ * In this case tree_to_edge indexes into \a ett_offset.
  * Otherwise the tree_to_edge entry must be -1 and this edge is ignored.
  * If num_edges == 0, tree_to_edge and edge_to_* arrays are set to NULL.
  *
  * The arrays edge_to_* store a variable number of entries per edge.
  * For edge e these are at position [ett_offset[e]]..[ett_offset[e+1]-1].
  * Their number for edge e is ett_offset[e+1] - ett_offset[e].
+ * The entries encode all trees adjacent to edge e.
  * The size of the edge_to_* arrays is num_ett = ett_offset[num_edges].
  * The edge_to_edge array holds values in 0..23, where the lower 12 indicate
  * one edge orientation and the higher 12 the opposite edge orientation.
  *
  * The corners are only stored when they connect trees.
+ * In this case tree_to_corner indexes into \a ctt_offset.
  * Otherwise the tree_to_corner entry must be -1 and this corner is ignored.
  * If num_corners == 0, tree_to_corner and corner_to_* arrays are set to NULL.
  *
  * The arrays corner_to_* store a variable number of entries per corner.
  * For corner c these are at position [ctt_offset[c]]..[ctt_offset[c+1]-1].
  * Their number for corner c is ctt_offset[c+1] - ctt_offset[c].
+ * The entries encode all trees adjacent to corner c.
  * The size of the corner_to_* arrays is num_ctt = ctt_offset[num_corners].
  *
  * The *_to_attr arrays may have arbitrary contents defined by the user.
@@ -177,7 +182,7 @@ typedef struct p8est_connectivity
 
   p4est_topidx_t     *tree_to_tree; /**< (6 * \a num_trees) neighbors across
                                          faces */
-  int8_t             *tree_to_face; /**< (4 * \a num_trees) face to
+  int8_t             *tree_to_face; /**< (6 * \a num_trees) face to
                                          face+orientation (see description) */
   p4est_topidx_t     *tree_to_edge; /**< (12 * \a num_trees) or NULL (see
                                           description) */
@@ -234,12 +239,14 @@ p8est_corner_info_t;
 /** Store the corner numbers 0..7 for each tree face. */
 extern const int    p8est_face_corners[6][4];
 
-/** Store the face numbers 0..12 for each tree face. */
+/** Store the edge numbers 0..11 for each tree face. */
 extern const int    p8est_face_edges[6][4];
 
 /** Store the face numbers in the face neighbor's system. */
 extern const int    p8est_face_dual[6];
 
+/* face corners */
+
 /** Store only the 8 out of 24 possible permutations that occur. */
 extern const int    p8est_face_permutations[8][4];
 
@@ -247,19 +254,38 @@ extern const int    p8est_face_permutations[8][4];
 extern const int    p8est_face_permutation_sets[3][4];
 
 /** For each face combination store the permutation set.
- * The order is [my_face][neighbor_face]
- */
+ * The order is [my_face][neighbor_face] */
 extern const int    p8est_face_permutation_refs[6][6];
 
+/* face edges */
+
+/** Store only the 8 out of 24 possible permutations that occur. */
+extern const int    p8est_face_edge_permutations[8][4];
+
+/** Store the 3 occurring sets of 4 permutations per face. */
+extern const int    p8est_face_edge_permutation_sets[3][4];
+
 /** Store the face numbers 0..5 for each tree edge. */
 extern const int    p8est_edge_faces[12][2];
 
 /** Store the corner numbers 0..8 for each tree edge. */
 extern const int    p8est_edge_corners[12][2];
 
-/** Store the face corner numbers for the faces touching a tree edge. */
+/** Store the edge corner numbers 0..1 for the corners touching a tree edge */
+extern const int    p8est_edge_edge_corners[12][8];
+
+/** Store the face corner numbers 0..3 for the faces touching a tree edge. */
 extern const int    p8est_edge_face_corners[12][6][2];
 
+/** Store the face edge numbers 0..3 for the faces touching a tree edge. */
+extern const int    p8est_edge_face_edges[12][6];
+
+/** Store the sets of permutations that occur */
+extern const int    p8est_edge_corner_permutation_sets[2];
+
+/** Store the two possible permutations that occur */
+extern const int    p8est_edge_corner_permutations[2][2];
+
 /** Store the face numbers 0..5 for each tree corner. */
 extern const int    p8est_corner_faces[8][3];
 
@@ -269,6 +295,9 @@ extern const int    p8est_corner_edges[8][3];
 /** Store the face corner numbers for the faces touching a tree corner. */
 extern const int    p8est_corner_face_corners[8][6];
 
+/** Store the edge corner numbers for the edges touching a tree corner. */
+extern const int    p8est_corner_edge_corners[8][12];
+
 /** Store the faces for each child and edge, can be -1. */
 extern const int    p8est_child_edge_faces[8][12];
 
@@ -302,6 +331,26 @@ int                 p8est_connectivity_face_neighbor_corner_set
 int                 p8est_connectivity_face_neighbor_corner_orientation
   (int c, int f, int nf, int o);
 
+/** Transform an edge across one of the adjacent faces into a neighbor tree.
+ * This version expects the neighbor face and orientation separately.
+ * \param [in] e    An edge number in 0..11.
+ * \param [in] f    A face number that touches the edge \a e.
+ * \param [in] nf   A neighbor face that is on the other side of \a f.
+ * \param [in] o    The orientation between tree boundary faces \a f and \a nf.
+ */
+int                 p8est_connectivity_face_neighbor_edge_orientation
+  (int e, int f, int nf, int o);
+
+/** Transform a corner across one of the adjacent edges into a neighbor tree.
+ * This version expects the neighbor edge and orientation separately.
+ * \param [in] c    A corner number in 0..7.
+ * \param [in] e    An edge number that touches the corner \a c.
+ * \param [in] ne   A neighbor edge that is on the other side of \a e.
+ * \param [in] o    The orientation between tree boundary edges \a e and \a ne.
+ */
+int                 p8est_connectivity_edge_neighbor_corner_orientation
+  (int c, int e, int ne, int o);
+
 /** Allocate a connectivity structure.
  * The attribute fields are initialized to NULL.
  * \param [in] num_vertices   Number of total vertices (i.e. geometric points).
@@ -326,7 +375,11 @@ p8est_connectivity_t *p8est_connectivity_new (p4est_topidx_t num_vertices,
  * \param [in] num_edges      Number of tree-connecting edges.
  * \param [in] num_corners    Number of tree-connecting corners.
  * \param [in] eoff           Edge-to-tree offsets (num_edges + 1 values).
+ *                            This must always be non-NULL; in trivial cases
+ *                            it is just a pointer to a p4est_topidx_t value of 0.
  * \param [in] coff           Corner-to-tree offsets (num_corners + 1 values).
+ *                            This must always be non-NULL; in trivial cases
+ *                            it is just a pointer to a p4est_topidx_t value of 0.
  * \return                    The connectivity is checked for validity.
  */
 p8est_connectivity_t *p8est_connectivity_new_copy (p4est_topidx_t
@@ -349,6 +402,21 @@ p8est_connectivity_t *p8est_connectivity_new_copy (p4est_topidx_t
                                                    const p4est_topidx_t * ctt,
                                                    const int8_t * ctc);
 
+/** Broadcast a connectivity structure that exists only on one process to all.
+ *  On the other processors, it will be allocated using p8est_connectivity_new.
+ *  \param [in] conn_in For the root process the connectivity to be broadcast,
+ *                      for the other processes it must be NULL.
+ *  \param [in] root    The rank of the process that provides the connectivity.
+ *  \param [in] comm    The MPI communicator.
+ *  \return             For the root process this is a pointer to \a conn_in.
+ *                      Else, a pointer to a newly allocated connectivity
+ *                      structure with the same values as \a conn_in on the
+ *                      root process.
+ */
+p8est_connectivity_t *p8est_connectivity_bcast (p8est_connectivity_t *
+                                                conn_in, int root,
+                                                sc_MPI_Comm comm);
+
 /** Destroy a connectivity structure.  Also destroy all attributes.
  */
 void                p8est_connectivity_destroy (p8est_connectivity_t *
@@ -440,6 +508,16 @@ p8est_connectivity_t *p8est_connectivity_new_rotwrap (void);
  */
 p8est_connectivity_t *p8est_connectivity_new_twocubes (void);
 
+/** Create a connectivity structure for two trees being rotated
+ * w.r.t. each other in a user-defined way.
+ * \param[in] l_face      index of left face
+ * \param[in] r_face      index of right face
+ * \param[in] orientation orientation of trees w.r.t. each other
+ */
+p8est_connectivity_t *p8est_connectivity_new_twotrees (int l_face,
+                                                       int r_face,
+                                                       int orientation);
+
 /** Create a connectivity structure that contains two cubes
  * where the two far ends are identified periodically.
  */
@@ -487,6 +565,18 @@ p8est_connectivity_t *p8est_connectivity_new_sphere (void);
  */
 p8est_connectivity_t *p8est_connectivity_new_byname (const char *name);
 
+/** Uniformly refine a connectivity.
+ * This is useful if you would like to uniformly refine by something other
+ * than a power of 2.
+ *
+ * \param [in] conn         a valid connectivity
+ * \param [in] num_per_edge the number of new trees in each direction
+ *
+ * \return a refined connectivity.
+ */
+p8est_connectivity_t *p8est_connectivity_refine (p8est_connectivity_t * conn,
+                                                 int num_per_edge);
+
 /** Fill an array with the axis combination of a face neighbor transform.
  * \param [in]  iface       The number of the originating face.
  * \param [in]  nface       Encoded as nface = r * 6 + nf, where nf = 0..5 is
@@ -756,7 +846,7 @@ p8est_corner_array_index (sc_array_t * array, size_t it)
  * \endcode
  *
  * This code can be called two ways.  The first, when \c vertex==NULL and \c
- * tree_to_vertex==NULL, is used to count the number of tress and vertices in
+ * tree_to_vertex==NULL, is used to count the number of trees and vertices in
  * the connectivity to be generated by the \c .inp mesh in the \a stream.  The
  * second, when \c vertices!=NULL and \c tree_to_vertex!=NULL, fill \c vertices
  * and \c tree_to_vertex.  In this case \c num_vertices and \c num_trees need
diff --git a/example/timings/bricks3.c b/src/p8est_connrefine.c
similarity index 92%
copy from example/timings/bricks3.c
copy to src/p8est_connrefine.c
index 9e3e1e5..98c6521 100644
--- a/example/timings/bricks3.c
+++ b/src/p8est_connrefine.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -22,4 +23,4 @@
 */
 
 #include <p4est_to_p8est.h>
-#include "bricks2.c"
+#include "p4est_connrefine.c"
diff --git a/src/p8est_extended.h b/src/p8est_extended.h
index 610afce..6a1e1a7 100644
--- a/src/p8est_extended.h
+++ b/src/p8est_extended.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -42,6 +43,7 @@
 #include <p8est.h>
 #include <p8est_mesh.h>
 #include <p8est_iterate.h>
+#include <p8est_lnodes.h>
 
 SC_EXTERN_C_BEGIN;
 
@@ -146,6 +148,23 @@ p8est_mesh_t       *p8est_mesh_new_ext (p8est_t * p4est,
                                         int compute_level_lists,
                                         p8est_connect_type_t btype);
 
+/** Make a deep copy of a p8est.
+ * The connectivity is not duplicated.
+ * Copying of quadrant user data is optional.
+ * If old and new data sizes are 0, the user_data field is copied regardless.
+ * The inspect member of the copy is set to NULL.
+ * The revision counter of the copy is set to zero.
+ *
+ * \param [in]  copy_data  If true, data are copied.
+ *                         If false, data_size is set to 0.
+ * \param [in]  duplicate_mpicomm  If true, MPI communicator is copied.
+ * \return  Returns a valid p8est that does not depend on the input,
+ *                         except for borrowing the same connectivity.
+ *                         Its revision counter is 0.
+ */
+p8est_t            *p8est_copy_ext (p8est_t * input, int copy_data,
+                                    int duplicate_mpicomm);
+
 /** Refine a forest with a bounded refinement level and a replace option.
  * \param [in,out] p8est The forest is changed in place.
  * \param [in] refine_recursive Boolean to decide on recursive refinement.
@@ -235,6 +254,16 @@ p4est_gloidx_t      p8est_partition_ext (p8est_t * p8est,
                                          int partition_for_coarsening,
                                          p8est_weight_t weight_fn);
 
+/** Correct partition to allow one level of coarsening.
+ *
+ * \param [in] p8est                     forest whose partition is corrected
+ * \param [in,out] num_quadrants_in_proc partition that will be corrected
+ * \return                               absolute number of moved quadrants
+ */
+p4est_gloidx_t      p8est_partition_for_coarsening (p8est_t * p8est,
+                                                    p4est_locidx_t *
+                                                    num_quadrants_in_proc);
+
 /** p8est_iterate_ext adds the option \a remote: if this is false, then it is
  * the same as p8est_iterate; if this is true, then corner/edge callbacks are
  * also called on corners/edges for hanging faces/edges touched by local
@@ -305,6 +334,68 @@ p8est_t            *p8est_source_ext (sc_io_source_t * src,
                                       int broadcasthead, void *user_pointer,
                                       p8est_connectivity_t ** connectivity);
 
+/** Create the data necessary to create a PETSc DMPLEX representation of a
+ * forest, as well as the accompanying lnodes and ghost layer.  The forest
+ * must be at least face balanced (see p4est_balance()).  See
+ * test/test_plex2.c for example usage.
+ *
+ * All arrays should be initialized to hold sizeof (p4est_locidx_t), except
+ * for \a out_remotes, which should be initialized to hold
+ * (2 * sizeof (p4est_locidx_t)).
+ *
+ * \param[in]     p8est                 the forest
+ * \param[out]    ghost                 the ghost layer
+ * \param[out]    lnodes                the lnodes
+ * \param[in]     ctype                 the type of adjacency for the overlap
+ * \param[in]     overlap               the number of layers of overlap (zero
+ *                                      is acceptable)
+ * \param[out]    first_local_quad      the local quadrants are assigned
+ *                                      contiguous plex indices, starting with
+ *                                      this index
+ * \param[in,out] out_points_per_dim    filled with argument for
+ *                                      DMPlexCreateFromDAG()
+ * \param[in,out] out_cone_sizes        filled with argument for
+ *                                      DMPlexCreateFromDAG()
+ * \param[in,out] out_cones             filled with argument for
+ *                                      DMPlexCreateFromDAG()
+ * \param[in,out] out_cone_orientations filled with argument for
+ *                                      DMPlexCreateFromDAG()
+ * \param[in,out] out_vertex_coords     filled with argument for
+ *                                      DMPlexCreateFromDAG()
+ * \param[in,out] out_children          filled with argument for
+ *                                      DMPlexSetTree()
+ * \param[in,out] out_parents           filled with argument for
+ *                                      DMPlexSetTree()
+ * \param[in,out] out_childids          filled with argument for
+ *                                      DMPlexSetTree()
+ * \param[in,out] out_leaves            filled with argument for
+ *                                      PetscSFSetGraph()
+ * \param[in,out] out_remotes           filled with argument for
+ *                                      PetscSFSetGraph()
+ * \param[in]     custom_numbering      Whether to use the default numbering
+ *                                      (0) of DMPlex child ids or the custom
+ *                                      (1).
+ */
+void                p8est_get_plex_data_ext (p8est_t * p8est,
+                                             p8est_ghost_t ** ghost,
+                                             p8est_lnodes_t ** lnodes,
+                                             p8est_connect_type_t ctype,
+                                             int overlap,
+                                             p4est_locidx_t *
+                                             first_local_quad,
+                                             sc_array_t * out_points_per_dim,
+                                             sc_array_t * out_cone_sizes,
+                                             sc_array_t * out_cones,
+                                             sc_array_t *
+                                             out_cone_orientations,
+                                             sc_array_t * out_vertex_coords,
+                                             sc_array_t * out_children,
+                                             sc_array_t * out_parents,
+                                             sc_array_t * out_childids,
+                                             sc_array_t * out_leaves,
+                                             sc_array_t * out_remotes,
+                                             int custom_numbering);
+
 SC_EXTERN_C_END;
 
 #endif /* !P8EST_EXTENDED_H */
diff --git a/src/p8est_geometry.c b/src/p8est_geometry.c
index 0f797f1..6d98c5e 100644
--- a/src/p8est_geometry.c
+++ b/src/p8est_geometry.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_geometry.h b/src/p8est_geometry.h
index 1518c59..3c664c2 100644
--- a/src/p8est_geometry.h
+++ b/src/p8est_geometry.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -21,7 +22,7 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
 
-/** \file p8est_geometry.h transforms from vertex frame to physical space
+/** \file p8est_geometry.h transforms from vertex frame to physical space.
  *
  * \ingroup p8est
  */
@@ -33,6 +34,7 @@
 
 SC_EXTERN_C_BEGIN;
 
+/** This object encapsulates a custom geometry transformation. */
 typedef struct p8est_geometry p8est_geometry_t;
 
 /** Forward transformation from the reference unit square to physical space.
diff --git a/src/p8est_ghost.c b/src/p8est_ghost.c
index c7c4ae3..58e3a5c 100644
--- a/src/p8est_ghost.c
+++ b/src/p8est_ghost.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_ghost.h b/src/p8est_ghost.h
index 67fe21f..eb2a362 100644
--- a/src/p8est_ghost.h
+++ b/src/p8est_ghost.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -88,8 +89,8 @@ p8est_ghost_t;
  * \param [in] ghost    Ghost layer structure.
  * \return true if \a ghost is valid
  */
-int                 p8est_ghost_is_valid (p8est_t * p8est, p8est_ghost_t *ghost);
-
+int                 p8est_ghost_is_valid (p8est_t * p8est,
+                                          p8est_ghost_t * ghost);
 
 /** Calculate the memory usage of the ghost layer.
  * \param [in] ghost    Ghost layer structure.
@@ -250,6 +251,41 @@ void                p8est_ghost_exchange_data (p8est_t * p4est,
                                                p8est_ghost_t * ghost,
                                                void *ghost_data);
 
+/** Transient storage for asynchronous ghost exchange. */
+typedef struct p8est_ghost_exchange
+{
+  int                 is_custom;        /**< False for p8est_ghost_exchange_data */
+  int                 is_levels;        /**< Are we restricted to levels or not */
+  p8est_t            *p4est;
+  p8est_ghost_t      *ghost;
+  int                 minlevel, maxlevel;       /**< Meaningful with is_levels */
+  size_t              data_size;
+  void               *ghost_data;
+  int                *qactive, *qbuffer;
+  sc_array_t          requests, sbuffers;
+  sc_array_t          rrequests, rbuffers;
+}
+p8est_ghost_exchange_t;
+
+/** Begin an asynchronous ghost data exchange by posting messages.
+ * The arguments are identical to p8est_ghost_exchange_data.
+ * The return value is always non-NULL and must be passed to
+ * p8est_ghost_exchange_data_end to complete the exchange.
+ * The ghost data must not be accessed before completion.
+ * \param [in,out]  ghost_data  Must stay alive into the completion call.
+ * \return          Transient storage for messages in progress.
+ */
+p8est_ghost_exchange_t *p8est_ghost_exchange_data_begin
+  (p8est_t * p4est, p8est_ghost_t * ghost, void *ghost_data);
+
+/** Complete an asynchronous ghost data exchange.
+ * This function waits for all pending MPI communications.
+ * \param [in,out] exc  Data created ONLY by p8est_ghost_exchange_data_begin.
+ *                  It is deallocated before this function returns.
+ */
+void                p8est_ghost_exchange_data_end
+  (p8est_ghost_exchange_t * exc);
+
 /** Transfer data for local quadrants that are ghosts to other processors.
  * The data size is the same for all quadrants and can be chosen arbitrarily.
  * \param [in] p8est            The forest used for reference.
@@ -266,6 +302,29 @@ void                p8est_ghost_exchange_custom (p8est_t * p4est,
                                                  void **mirror_data,
                                                  void *ghost_data);
 
+/** Begin an asynchronous ghost data exchange by posting messages.
+ * The arguments are identical to p8est_ghost_exchange_custom.
+ * The return value is always non-NULL and must be passed to
+ * p8est_ghost_exchange_custom_end to complete the exchange.
+ * The ghost data must not be accessed before completion.
+ * The mirror data can be safely discarded right after this function returns
+ * since it is copied into internal send buffers.
+ * \param [in]      mirror_data Not required to stay alive any longer.
+ * \param [in,out]  ghost_data  Must stay alive into the completion call.
+ * \return          Transient storage for messages in progress.
+ */
+p8est_ghost_exchange_t *p8est_ghost_exchange_custom_begin
+  (p8est_t * p4est, p8est_ghost_t * ghost,
+   size_t data_size, void **mirror_data, void *ghost_data);
+
+/** Complete an asynchronous ghost data exchange.
+ * This function waits for all pending MPI communications.
+ * \param [in,out] exc  Data created ONLY by p8est_ghost_exchange_custom_begin.
+ *                  It is deallocated before this function returns.
+ */
+void                p8est_ghost_exchange_custom_end
+  (p8est_ghost_exchange_t * exc);
+
 /** Transfer data for local quadrants that are ghosts to other processors.
  * The data size is the same for all quadrants and can be chosen arbitrarily.
  * This function restricts the transfer to a range of refinement levels.
@@ -275,7 +334,7 @@ void                p8est_ghost_exchange_custom (p8est_t * p4est,
  * \param [in] minlevel         Level of the largest quads to be exchanged.
  *                              Use <= 0 for no restriction.
  * \param [in] maxlevel         Level of the smallest quads to be exchanged.
- *                              Use >= P4EST_QMAXLEVEL for no restriction.
+ *                              Use >= P8EST_QMAXLEVEL for no restriction.
  * \param [in] data_size        The data size to transfer per quadrant.
  * \param [in] mirror_data      One data pointer per mirror quadrant as input. 
  * \param [in,out] ghost_data   Pre-allocated contiguous data for all ghosts
@@ -290,6 +349,29 @@ void                p8est_ghost_exchange_custom_levels (p8est_t * p8est,
                                                         void **mirror_data,
                                                         void *ghost_data);
 
+/** Begin an asynchronous ghost data exchange by posting messages.
+ * The arguments are identical to p8est_ghost_exchange_custom_levels.
+ * The return value is always non-NULL and must be passed to
+ * p8est_ghost_exchange_custom_levels_end to complete the exchange.
+ * The ghost data must not be accessed before completion.
+ * The mirror data can be safely discarded right after this function returns
+ * since it is copied into internal send buffers.
+ * \param [in]      mirror_data Not required to stay alive any longer.
+ * \param [in,out]  ghost_data  Must stay alive into the completion call.
+ * \return          Transient storage for messages in progress.
+ */
+p8est_ghost_exchange_t *p8est_ghost_exchange_custom_levels_begin
+  (p8est_t * p4est, p8est_ghost_t * ghost, int minlevel, int maxlevel,
+   size_t data_size, void **mirror_data, void *ghost_data);
+
+/** Complete an asynchronous ghost data exchange.
+ * This function waits for all pending MPI communications.
+ * \param [in,out] exc  Data created ONLY by p8est_ghost_exchange_custom_levels_begin.
+ *                  It is deallocated before this function returns.
+ */
+void                p8est_ghost_exchange_custom_levels_end
+  (p8est_ghost_exchange_t * exc);
+
 /** Expand the size of the ghost layer and mirrors by one additional layer of
  * adjacency.
  * \param [in] p8est            The forest from which the ghost layer was
diff --git a/src/p8est_io.c b/src/p8est_io.c
index c294560..8c82fa0 100644
--- a/src/p8est_io.c
+++ b/src/p8est_io.c
@@ -4,7 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
-  Copyright (C) 2012 Carsten Burstedde
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_io.h b/src/p8est_io.h
index e21ad16..e478c32 100644
--- a/src/p8est_io.h
+++ b/src/p8est_io.h
@@ -4,7 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
-  Copyright (C) 2012 Carsten Burstedde
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -41,6 +41,7 @@ sc_array_t         *p8est_deflate_quadrants (p8est_t * p8est,
                                              sc_array_t ** data);
 
 /** Create a new p4est based on serialized data.
+ * Its revision counter is set to zero.
  * See p8est.h and p8est_communication.h for more information on parameters.
  * \param [in] mpicomm       A valid MPI communicator.
  * \param [in] connectivity  This is the connectivity information that
@@ -55,7 +56,7 @@ sc_array_t         *p8est_deflate_quadrants (p8est_t * p8est,
  *                           The elem_size of this array informs data_size.
  *                           Its elem_count equals the number of local quads.
  * \param [in] user_pointer  Assign to the user_pointer member of the p4est.
- * \return              The newly created p4est.
+ * \return              The newly created p4est with a zero revision counter.
  */
 p8est_t            *p8est_inflate (sc_MPI_Comm mpicomm,
                                    p8est_connectivity_t * connectivity,
diff --git a/src/p8est_iterate.c b/src/p8est_iterate.c
index 5623787..28eb05e 100644
--- a/src/p8est_iterate.c
+++ b/src/p8est_iterate.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_iterate.h b/src/p8est_iterate.h
index ae0b888..d39d8ad 100644
--- a/src/p8est_iterate.h
+++ b/src/p8est_iterate.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -330,7 +331,7 @@ p8est_iter_cside_array_index_int (sc_array_t * array, int it)
   P4EST_ASSERT (it >= 0 && (size_t) it < array->elem_count);
 
   return (p8est_iter_corner_side_t *)
-    (array->array + sizeof (p8est_iter_corner_side_t) * it);
+    (array->array + sizeof (p8est_iter_corner_side_t) * (size_t) it);
 }
 
 /** Return a pointer to a iter_corner_side array element indexed by a size_t.
@@ -356,7 +357,7 @@ p8est_iter_eside_array_index_int (sc_array_t * array, int it)
   P4EST_ASSERT (it >= 0 && (size_t) it < array->elem_count);
 
   return (p8est_iter_edge_side_t *)
-    (array->array + sizeof (p8est_iter_edge_side_t) * it);
+    (array->array + sizeof (p8est_iter_edge_side_t) * (size_t) it);
 }
 
 /** Return a pointer to a iter_edge_side array element indexed by a size_t.
@@ -382,7 +383,7 @@ p8est_iter_fside_array_index_int (sc_array_t * array, int it)
   P4EST_ASSERT (it >= 0 && (size_t) it < array->elem_count);
 
   return (p8est_iter_face_side_t *)
-    (array->array + sizeof (p8est_iter_face_side_t) * it);
+    (array->array + sizeof (p8est_iter_face_side_t) * (size_t) it);
 }
 
 /** Return a pointer to a iter_face_side array element indexed by a size_t.
diff --git a/src/p8est_lnodes.c b/src/p8est_lnodes.c
index 3c7d950..4509882 100644
--- a/src/p8est_lnodes.c
+++ b/src/p8est_lnodes.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_lnodes.h b/src/p8est_lnodes.h
index 72eb592..8208dd4 100644
--- a/src/p8est_lnodes.h
+++ b/src/p8est_lnodes.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -248,7 +249,33 @@ p8est_lnodes_t     *p8est_lnodes_new (p8est_t * p8est,
                                       p8est_ghost_t * ghost_layer,
                                       int degree);
 
-void                p8est_lnodes_destroy (p8est_lnodes_t *);
+void                p8est_lnodes_destroy (p8est_lnodes_t * lnodes);
+
+/** Partition using weights based on the number of nodes assigned to each
+ * element in lnodes
+ *
+ * \param[in,out] p8est                    the forest to be repartitioned
+ * \param[in]     ghost                    the ghost layer
+ * \param[in]     degree                   the degree that would be passed to p8est_lnodes_new()
+ * \param[in]     partition_for_coarsening whether the partition should allow
+ *                                         coarsening (i.e. group siblings who
+ *                                         might merge)
+ */
+void                p8est_partition_lnodes (p8est_t * p8est,
+                                            p8est_ghost_t * ghost, int degree,
+                                            int partition_for_coarsening);
+
+/** Partition using weights that are broken down by where they reside: in
+ * volumes, on faces, on edges, or on corners.
+ */
+void                p8est_partition_lnodes_detailed (p8est_t * p4est,
+                                                     p8est_ghost_t * ghost,
+                                                     int nodes_per_volume,
+                                                     int nodes_per_face,
+                                                     int nodes_per_edge,
+                                                     int nodes_per_corner,
+                                                     int
+                                                     partition_for_coarsening);
 
 /** Expand the ghost layer to include the support of all nodes supported on
  * the local partition.
@@ -371,7 +398,7 @@ p8est_lnodes_rank_array_index_int (sc_array_t * array, int it)
   P4EST_ASSERT (it >= 0 && (size_t) it < array->elem_count);
 
   return (p8est_lnodes_rank_t *)
-    (array->array + sizeof (p8est_lnodes_rank_t) * it);
+    (array->array + sizeof (p8est_lnodes_rank_t) * (size_t) it);
 }
 
 /** Return a pointer to a lnodes_rank array element indexed by a size_t.
diff --git a/src/p8est_mesh.c b/src/p8est_mesh.c
index 92be501..ce92d29 100644
--- a/src/p8est_mesh.c
+++ b/src/p8est_mesh.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_mesh.h b/src/p8est_mesh.h
index 16ab7d4..d3898f1 100644
--- a/src/p8est_mesh.h
+++ b/src/p8est_mesh.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -82,8 +83,8 @@ SC_EXTERN_C_BEGIN;
  * Each group contains the quadrant numbers encoded as usual for quad_to_quad
  * in corner_quad, and the corner number from the neighbor as corner_corner.
  *
- * Intra-tree corners and corners across an inter-tree face are implemented.
- * Other inter-tree corners are NOT IMPLEMENTED and are assigned the value -2.
+ * Intra-tree corners and inter-tree face and corner corners are implemented.
+ * Edge inter-tree corners are NOT IMPLEMENTED and are assigned the value -2.
  * Corners with no diagonal neighbor at all are assigned the value -1.
  */
 typedef struct
diff --git a/src/p8est_nodes.c b/src/p8est_nodes.c
index 2c6f748..5fde3a1 100644
--- a/src/p8est_nodes.c
+++ b/src/p8est_nodes.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_nodes.h b/src/p8est_nodes.h
index 27d7aea..1a1c923 100644
--- a/src/p8est_nodes.h
+++ b/src/p8est_nodes.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_plex.c b/src/p8est_plex.c
index 5a84e28..0abb3b8 100644
--- a/src/p8est_plex.c
+++ b/src/p8est_plex.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_plex.h b/src/p8est_plex.h
index dbd528d..26fda37 100644
--- a/src/p8est_plex.h
+++ b/src/p8est_plex.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_points.c b/src/p8est_points.c
index b1dc59a..dd691dc 100644
--- a/src/p8est_points.c
+++ b/src/p8est_points.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_points.h b/src/p8est_points.h
index 29b2c03..42a39e7 100644
--- a/src/p8est_points.h
+++ b/src/p8est_points.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_search.c b/src/p8est_search.c
index 527cbbb..35da047 100644
--- a/src/p8est_search.c
+++ b/src/p8est_search.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_search.h b/src/p8est_search.h
index a726e8e..eb00210 100644
--- a/src/p8est_search.h
+++ b/src/p8est_search.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -24,6 +25,13 @@
 #ifndef P8EST_SEARCH_H
 #define P8EST_SEARCH_H
 
+/** \file p8est_search.h
+ * Search through quadrants, the local part of a forest, or the partition.
+ *
+ * This file provides several helper functions and recursive algorithms.
+ * \ingroup p8est
+ */
+
 #include <p8est.h>
 
 SC_EXTERN_C_BEGIN;
@@ -44,14 +52,14 @@ ssize_t             p8est_find_higher_bound (sc_array_t * array,
                                              const p8est_quadrant_t * q,
                                              size_t guess);
 
-/** Given a sorted \a array of quadrants that have a common ancestor at level
- * \a level, compute the \a indices of the first quadrant in each of the common
- * ancestor's children at level \a level + 1;
- * \param [in] array     The sorted array of quadrants of level > \a level.
+/** Given a sorted \b array of quadrants that have a common ancestor at level
+ * \b level, compute the \b indices of the first quadrant in each of the common
+ * ancestor's children at level \b level + 1.
+ * \param [in] array     The sorted array of quadrants of level > \b level.
  * \param [in] level     The level at which there is a common ancestor.
  * \param [in,out] indices     The indices of the first quadrant in each of
  *                             the ancestors's children, plus an additional
- *                             index on the end.  The quadrants of \a array
+ *                             index on the end.  The quadrants of \b array
  *                             that are descendants of child i have indices
  *                             between indices[i] and indices[i + 1] - 1.  If
  *                             indices[i] = indices[i+1], this indicates that
@@ -61,7 +69,9 @@ ssize_t             p8est_find_higher_bound (sc_array_t * array,
 void                p8est_split_array (sc_array_t * array, int level,
                                        size_t indices[]);
 
-/** Given two smallest quadrants, \a lq and \a uq, that mark the first and the
+/** Find the boundary points touched by a range of quadrants.
+ *
+ * Given two smallest quadrants, \b lq and \b uq, that mark the first and the
  * last quadrant in a range of quadrants, determine which portions of the tree
  * boundary the range touches.
  * \param [in] lq        The smallest quadrant at the start of the range: if
@@ -73,17 +83,17 @@ void                p8est_split_array (sc_array_t * array, int level,
  * \param [in] level     The level of the containing quadrant whose boundaries
  *                       are tested: 0 if we want to test the boundaries of the
  *                       whole tree.
- * \param [in/out] faces       An array of size 6 that is filled: faces[i] is
+ * \param [in,out] faces       An array of size 6 that is filled: faces[i] is
  *                             true if the range touches that face.
- * \param [in/out] edges       An array of size 12 that is filled: edges[i] is
+ * \param [in,out] edges       An array of size 12 that is filled: edges[i] is
  *                             true if the range touches that edge.
- * \param [in/out] corners     An array of size 8 that is filled: corners[i] is
+ * \param [in,out] corners     An array of size 8 that is filled: corners[i] is
  *                             true if the range touches that corner.
- *                             \faces, \edges or \corners may be NULL.
- * \return  Returns an int32_t encoded with the same information in \faces,
- *          \edges and \corners: the first (least) six bits represent the six
- *          faces, the next twelve bits represent the twelve edges, the next
- *          eight bits represent the eight corners.
+ *                             \b faces, \b edges or \b corners may be NULL.
+ * \return  Returns an int32_t encoded with the same information in \b faces,
+ *          \b edges and \b corners: the first (least) six bits represent the
+ *          six faces, the next twelve bits represent the twelve edges, the
+ *          next eight bits represent the eight corners.
  */
 int32_t             p8est_find_range_boundaries (p8est_quadrant_t * lq,
                                                  p8est_quadrant_t * uq,
@@ -93,7 +103,7 @@ int32_t             p8est_find_range_boundaries (p8est_quadrant_t * lq,
 /** Callback function to query the match of a "point" with a quadrant.
  *
  * This function can be called in two roles:  Per-quadrant, in which case the
- * parameter \a point is NULL, or per-point, possibly many times per quadrant.
+ * parameter \b point is NULL, or per-point, possibly many times per quadrant.
  *
  * \param [in] p8est        The forest to be queried.
  * \param [in] which_tree   The tree id under consideration.
@@ -107,10 +117,10 @@ int32_t             p8est_find_range_boundaries (p8est_quadrant_t * lq,
  *                          it is the (non-negative) index of the quadrant
  *                          relative to the processor-local quadrant storage.
  * \param [in] point        Representation of a "point"; user-defined.
- *                          If \a point is NULL, the callback may be used to
+ *                          If \b point is NULL, the callback may be used to
  *                          prepare quadrant-related search meta data.
- * \return                  If \a point is NULL, true if the search confined to
- *                          \a quadrant should be executed, false to skip it.
+ * \return                  If \b point is NULL, true if the search confined to
+ *                          \b quadrant should be executed, false to skip it.
  *                          Else, true if point may be contained in the
  *                          quadrant and false otherwise; the return value has
  *                          no effect on a leaf.
@@ -121,13 +131,21 @@ typedef int         (*p8est_search_query_t) (p8est_t * p8est,
                                              p4est_locidx_t local_num,
                                              void *point);
 
-/** Search "points" from a given set in the forest.
+/** Search through the local part of a forest.
+ * The search is especially efficient if multiple targets, called "points"
+ * below, are searched for simultaneously.
  *
  * The search runs over all local quadrants and proceeds recursively top-down.
+ * For each tree, it may start at the root of that tree, or further down at the
+ * root of the subtree that contains all of the tree's local quadrants.
+ * Likewise, some intermediate levels in the recursion may be skipped.
  * Its outer loop is thus a depth-first, processor-local forest traversal.
  * Each quadrant in that loop either is a leaf, or a (direct or indirect)
  * strict ancestor of a leaf.  On entering a new quadrant, a user-provided
  * quadrant-callback is executed.
+ *
+ * As a convenience, the user may provide anonymous "points" that are tracked
+ * down the forest.  This way one search call may be used for multiple targets.
  * The set of points that potentially matches a given quadrant diminishes from
  * the root down to the leaves:  For each quadrant, an inner loop over the
  * potentially matching points executes a point-callback for each candidate
@@ -142,18 +160,63 @@ typedef int         (*p8est_search_query_t) (p8est_t * p8est,
  *
  * \param [in] p8est        The forest to be searched.
  * \param [in] search_quadrant_fn   Executed once for each quadrant that is
- *                          entered.  This quadrant is always local.  If the
+ *                          entered.  This quadrant is always local, if not
+ *                          itself then at least one child of it.  If the
  *                          callback returns false, this quadrant and its
  *                          descendants are excluded from the search.
+ *                          Its \b point argument is always NULL.
  *                          May be NULL in which case it is ignored.
- * \param [in] search_point_fn      Must return true for a possible match.
+ * \param [in] search_point_fn      If \b points is not NULL, must be not NULL.
+ *                          Must return true for any possible matching point.
+ *                          If \b points is NULL, this callback is ignored.
  * \param [in] points       User-defined array of "points".
+ *                          If NULL, only the \b search_quadrant_fn callback
+ *                          is executed.  If that is NULL, this function noops.
+ *                          If not NULL, the \b search_point_fn is called on
+ *                          its members during the search.
  */
 void                p8est_search (p8est_t * p8est,
                                   p8est_search_query_t search_quadrant_fn,
                                   p8est_search_query_t search_point_fn,
                                   sc_array_t * points);
 
+/** Callback function for the traversal recursion.
+ * \param [in] p8est        The forest to traverse.
+ *                          Its local quadrants are never accessed.
+ * \param [in] which_tree   The tree number under consideration.
+ * \param [in] quadrant     This quadrant is not from local forest storage,
+ *                          and its user data is undefined.  It represents
+ *                          the branch of the forest in the top-down recursion.
+ * \param [in] pfirst       The lowest processor that owns part of \b quadrant.
+ *                          Guaranteed to be non-empty.
+ * \param [in] plast        The highest processor that owns part of \b quadrant.
+ *                          Guaranteed to be non-empty.  If this is equal to
+ *                          \b pfirst, then the recursion will stop for
+ *                          quadrant's branch after this function returns.
+ * \return                  If false, the recursion at quadrant is terminated.
+ *                          If true, it continues if \b pfirst < \b plast.
+ */
+typedef int         (*p8est_traverse_query_t) (p8est_t * p8est,
+                                               p4est_topidx_t which_tree,
+                                               p8est_quadrant_t * quadrant,
+                                               int pfirst, int plast);
+
+/** Traverse the global partition top-down.
+ * We proceed top-down through the partition, identically on all processors
+ * except for the results of a user-provided callback.  The recursion will only
+ * go down branches that are split between multiple processors.  The callback
+ * function can be used to stop a branch recursion even for split branches.
+ * \note Traversing the whole processor partition will likely be inefficient,
+ *       so sensible use of the callback function is advised.
+ * \param [in] p8est        The forest to traverse.
+ *                          Its local quadrants are never accessed.
+ * \param [in] traverse_fn  This function controls the recursion,
+ *                          which only continues deeper if this
+ *                          callback returns true for a branch quadrant.
+ */
+void                p8est_traverse (p8est_t * p8est,
+                                    p8est_traverse_query_t traverse_fn);
+
 SC_EXTERN_C_END;
 
-#endif
+#endif /* !P8EST_SEARCH_H */
diff --git a/src/p8est_tets_hexes.c b/src/p8est_tets_hexes.c
index 6fe97d1..b5c1081 100644
--- a/src/p8est_tets_hexes.c
+++ b/src/p8est_tets_hexes.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_tets_hexes.h b/src/p8est_tets_hexes.h
index a552132..4e041d8 100644
--- a/src/p8est_tets_hexes.h
+++ b/src/p8est_tets_hexes.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_vtk.c b/src/p8est_vtk.c
index b4afc09..bf5e4e1 100644
--- a/src/p8est_vtk.c
+++ b/src/p8est_vtk.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/src/p8est_vtk.h b/src/p8est_vtk.h
index ef38265..24988a9 100644
--- a/src/p8est_vtk.h
+++ b/src/p8est_vtk.h
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -23,7 +24,7 @@
 
 /** \file p8est_vtk.h
  *
- * Routines for printing a forest and associated fields to vtk format.
+ * Routines for printing a forest and associated fields to VTK format.
  *
  * \ingroup p8est
  */
@@ -36,159 +37,216 @@
 
 SC_EXTERN_C_BEGIN;
 
-/** This writes out the p8est in VTK format.
+/** Opaque context type for writing VTK output with multiple function calls.
+ */
+typedef struct p8est_vtk_context p8est_vtk_context_t;
+
+/** Write the p8est in VTK format.
  *
- * This is a convenience function for the special
- * case of writing out the tree id, quadrant level, and MPI rank only.
+ * This is a convenience function for the special case of writing out
+ * the tree id, quadrant level, and MPI rank only.
  * One file is written per MPI rank, and one meta file on rank 0.
+ * The quadrants are scaled to length .95; see \ref p8est_vtk_write_header.
  * This function will abort if there is a file error.
  *
  * \param [in] p8est    The p8est to be written.
- * \param [in] geom     A p8est_geometry_t structure or NULL for vertex space.
+ * \param [in] geom     A p8est_geometry_t structure or NULL for vertex space
+ *                      as defined by p8est->connectivity.
  * \param [in] filename The first part of the file name which will have the
  *                      MPI rank appended to it: The output file will be
- *                      filename_procNum.vtu, and the meta file filename.pvtu.
+ *                      filename_rank.vtu, and the meta file filename.pvtu.
  */
 void                p8est_vtk_write_file (p8est_t * p8est,
                                           p8est_geometry_t * geom,
                                           const char *filename);
 
-/** This writes out the p8est and any number of point fields in VTK format.
- *
- * This is a convenience function that will abort if there is a file error.
- *
- * \param [in] p8est    The p8est to be written.
- * \param [in] geom     A p8est_geometry_t structure or NULL for vertex space.
- * \param [in] scale    Double value between 0 and 1 to scale each quadrant.
- * \param [in] write_tree   Include the tree id as output field.
- * \param [in] write_level  Include the tree levels as output field.
- * \param [in] write_rank   Include the MPI rank as output field.
- * \param [in] wrap_tree    The MPI rank is written module wrap_tree, or 0.
- * \param filename      First part of the name, see p8est_vtk_write_file.
- * \param num_scalars   Number of scalar fields to write.
- * \param num_vectors   Number of vector fields to write.
- *
- * The variable arguments need to be pairs of (fieldname, fieldvalues)
- * where the scalars come first, then the vectors.
- */
-void                p8est_vtk_write_all (p8est_t * p8est,
-                                         p8est_geometry_t * geom,
-                                         double scale,
-                                         int write_tree, int write_level,
-                                         int write_rank, int wrap_rank,
-                                         int num_scalars, int num_vectors,
-                                         const char *filename, ...);
-
-/** This will write the header of the vtu file.
- *
- * Writing a VTK file is split into a couple of routines.
- * The allows there to be an arbitrary number of
- * fields.  The calling sequence would be something like
- *
- * \begincode
- * p8est_vtk_write_header(p8est, geom, 1., 1, 1, 1, 0, "output");
- * p8est_vtk_write_point_scalar (...);
- * ...
- * p8est_vtk_write_footer(p8est, "output");
- * \endcode
- *
- * \param p8est     The p8est to be written.
- * \param geom      A p8est_geometry_t structure or NULL for vertex space.
- * \param scale     The relative length factor of the quadrants.
- *                  Use 1.0 to fit quadrants exactly, less to create gaps.
- * \param write_tree    Boolean to determine if the tree id should be output.
- * \param write_level   Boolean to determine if the tree levels should be output.
- * \param write_rank    Boolean to determine if the MPI rank should be output.
- * \param wrap_rank Number to wrap around the rank with a modulo operation.
- *                  Can be 0 for no wrapping.
- * \param point_scalars  Comma-separated list of point scalar fields, or NULL.
- * \param point_vectors  Comma-separated list of point vector fields, or NULL.
- * \param filename  The first part of the name which will have
- *                  the proc number appended to it (i.e., the
- *                  output file will be filename_procNum.vtu).
- *
- * \return          This returns 0 if no error and -1 if there is an error.
+/** The first call to write a VTK file using individual functions.
+ *
+ * Writing a VTK file is split into multiple functions that keep a context.
+ * This is the first function that allocates the opaque context structure.
+ * After allocation, further parameters can be set for the context.
+ * Then, the header, possible data fields, and the footer must be written.
+ * The process can be aborted any time by destroying the context.  In this
+ * case, open files are closed cleanly with only partially written content.
+ *
+ * \param p4est     The p8est to be written.
+ *                  If no geometry is specified in
+ *                  \ref p8est_vtk_context_set_geom, we require
+ *                  \b p8est->connectivity to have valid vertex arrays.
+ * \param filename  The first part of the name which will have the processor
+ *                  number appended to it (i.e., the output file will be
+ *                  filename_rank.vtu).  The parallel meta-files for Paraview
+ *                  and Visit use this basename too.
+ *                  We copy this filename to internal storage, so it is not
+ *                  needed to remain alive after calling this function.
+ * \return          A VTK context for further use.
  */
-int                 p8est_vtk_write_header (p8est_t * p8est,
-                                            p8est_geometry_t * geom,
-                                            double scale,
-                                            int write_tree, int write_level,
-                                            int write_rank, int wrap_rank,
-                                            const char *point_scalars,
-                                            const char *point_vectors,
+p8est_vtk_context_t *p8est_vtk_context_new (p8est_t * p4est,
                                             const char *filename);
 
-/** This will write a scalar field to the vtu file.
- *
- * It is good practice to make sure that the scalar field also
- * exists in the comma separated string \a point_scalars passed
- * to \c p8est_vtk_write_header.
- *
- * Writing a VTK file is split into a couple of routines.
- * The allows there to be an arbitrary number of fields.
+/** Modify the geometry transformation registered in the context.
+ * After \ref p8est_vtk_context_new, it is at the default NULL.
+ * \param [in,out] cont         The context is modified.
+ *                              It must not yet have been used to start writing
+ *                              in \ref p8est_vtk_write_header.
+ * \param geom      A \ref p8est_geometry_t structure, or NULL for vertex space.
+ *                  If NULL, \b p8est->connectivity->vertices and
+ *                  \b tree_to_vertex must be non-NULL.
+ */
+void                p8est_vtk_context_set_geom (p8est_vtk_context_t * cont,
+                                                p8est_geometry_t * geom);
+
+/** Modify the context parameter for scaling the quadrants.
+ * After \ref p8est_vtk_context_new, it is at the default 0.95.
+ * \param [in,out] cont         The context is modified.
+ *                              It must not yet have been used to start writing
+ *                              in \ref p8est_vtk_write_header.
+ * \param [in] scale            Scale parameter must be in (0, 1].
+ */
+void                p8est_vtk_context_set_scale (p8est_vtk_context_t * cont,
+                                                 double scale);
+
+/** Modify the context parameter for expecting continuous point data.
+ * If set to true, the point data is understood as a continuous field.
+ * In this case, we can significantly reduce the file size when scale == 1.
+ * For discontinuous point data, it should be set to false.
+ * After \ref p8est_vtk_context_new, it is at the default false.
+ * \param [in,out] cont         The context is modified.
+ *                              It must not yet have been used to start writing
+ *                              in \ref p8est_vtk_write_header.
+ * \param [in] continuous       Boolean parameter.
+ */
+void                p8est_vtk_context_set_continuous (p8est_vtk_context_t *
+                                                      cont, int continuous);
+/** Cleanly destroy a \ref p8est_vtk_context_t structure.
  *
- * \param p8est     The p8est to be written.
- * \param geom      A p8est_geometry_t structure or NULL for vertex space.
- * \param filename  The first part of the name which will have
- *                  the proc number appended to it (i.e., the
- *                  output file will be filename_procNum.vtu).
- * \param scalar_name The name of the scalar field.
- * \param values    The point values that will be written.
+ * This function closes all the file pointers and frees the context.
+ * It can be called even if the VTK output
+ * has only been partially written; the files' content will be incomplete.
  *
- * \return          This returns 0 if no error and -1 if there is an error.
+ * \param[in] context     The VTK file context to be destroyed.
  */
-int                 p8est_vtk_write_point_scalar (p8est_t * p8est,
-                                                  p8est_geometry_t * geom,
-                                                  const char *filename,
-                                                  const char *scalar_name,
-                                                  const double *values);
+void                p8est_vtk_context_destroy (p8est_vtk_context_t * context);
 
-/** This will write a 3-vector field to the vtu file.
- *
- * It is good practice to make sure that the vector field also
- * exists in the comma separated string \a point_vectors passed
- * to \c p8est_vtk_write_header.
- *
- * Writing a VTK file is split into a couple of routines.
- * The allows there to be an arbitrary number of fields.
+/** Write the VTK header.
  *
- * \param p8est     The p8est to be written.
- * \param geom      A p8est_geometry_t structure or NULL for vertex space.
- * \param filename  The first part of the name which will have
- *                  the proc number appended to it (i.e., the
- *                  output file will be filename_procNum.vtu).
- * \param vector_name The name of the vector field.
- * \param values    The point values that will be written.
+ * Writing a VTK file is split into a few routines.
+ * This allows there to be an arbitrary number of
+ * fields.  The calling sequence would be something like
  *
- * \return          This returns 0 if no error and -1 if there is an error.
+ *     vtk_context = p8est_vtk_context_new (p8est, "output");
+ *     p8est_vtk_context_set_* (vtk_context, parameter);
+ *     vtk_context = p8est_vtk_write_header (vtk_context, ...);
+ *     if (vtk_context == NULL) { error; }
+ *     vtk_context = p8est_vtk_write_cell_data (vtk_context, ...);
+ *     if (vtk_context == NULL) { error; }
+ *     vtk_context = p8est_vtk_write_point_data (vtk_context, ...);
+ *     if (vtk_context == NULL) { error; }
+ *     retval = p8est_vtk_write_footer (vtk_context);
+ *     if (retval) { error; }
+ *
+ * \param [in,out] cont    A VTK context created by \ref p8est_vtk_context_new.
+ *                         No vtk_write function may have been called on it yet.
+ *                         This context is the return value if no error occurs.
+ *
+ * \return          On success, an opaque context (p8est_vtk_context_t) pointer
+ *                  that must be passed to subsequent p8est_vtk calls.  It is
+ *                  required to call \ref p8est_vtk_write_footer eventually with
+ *                  this value.  Returns NULL on error.
+ */
+p8est_vtk_context_t *p8est_vtk_write_header (p8est_vtk_context_t * cont);
+
+/** Write VTK cell data.
+ *
+ * There are options to have this function write
+ * the tree id, quadrant level, or MPI rank without explicit input data.
+ *
+ * Writing a VTK file is split into a few routines.
+ * This allows there to be an arbitrary number of
+ * fields.
+ *
+ * \param [in,out] cont    A VTK context created by \ref p8est_vtk_context_new.
+ * \param [in] write_tree  Boolean to determine if the tree id should be output.
+ * \param [in] write_level Boolean to determine if the quadrant levels should be output.
+ * \param [in] write_rank  Boolean to determine if the MPI rank should be output.
+ * \param [in] wrap_rank   Number to wrap around the rank with a modulo operation.
+ *                         Can be 0 for no wrapping.
+ * \param [in] num_cell_scalars Number of cell scalar datasets to output.
+ * \param [in] num_cell_vectors Number of cell vector datasets to output.
+ *
+ * The variable arguments need to be pairs of (fieldname, fieldvalues), followed
+ * by a final argument of the VTK context cont (same as the first argument).
+ * The cell scalar pairs come first, followed by the cell vector pairs, then cont.
+ * Each 'fieldname' argument shall be a char string containing the name of the data
+ * contained in the following 'fieldvalues'.  Each of the 'fieldvalues'
+ * arguments shall be an sc_array_t * holding double variables.  The number of
+ * doubles in each sc_array must be exactly \a p8est->local_num_quadrants for
+ * scalar data and \a 3*p8est->local_num_quadrants for vector data.
+ *
+ * \note The current p8est_vtk_context_t structure, \a cont, must be the first
+ * and the last argument
+ * of any call to this function; this argument is used to validate that the
+ * correct number of variable arguments have been provided.
+ *
+ * \return          On success, the context that has been passed in.
+ *                  On failure, returns NULL and deallocates the context.
  */
-int                 p8est_vtk_write_point_vector (p8est_t * p8est,
-                                                  p8est_geometry_t * geom,
-                                                  const char *filename,
-                                                  const char *vector_name,
-                                                  const double *values);
+p8est_vtk_context_t *p8est_vtk_write_cell_dataf (p8est_vtk_context_t * cont,
+                                                 int write_tree,
+                                                 int write_level,
+                                                 int write_rank,
+                                                 int wrap_rank,
+                                                 int num_cell_scalars,
+                                                 int num_cell_vectors, ...);
+
+/** Write VTK point data.
+ *
+ * Writing a VTK file is split into a few routines.
+ * This allows there to be an arbitrary number of
+ * fields.
+ *
+ * \param [in,out] cont    A VTK context created by \ref p8est_vtk_context_new.
+ * \param [in] num_point_scalars Number of point scalar datasets to output.
+ * \param [in] num_point_vectors Number of point vector datasets to output.
+ *
+ * The variable arguments need to be pairs of (fieldname, fieldvalues) where
+ * the point scalar pairs come first, followed by the point vector pairs.  Each
+ * 'fieldname' argument shall be a char string containing the name of the data
+ * contained in the following 'fieldvalues'. Each of the 'fieldvalues'
+ * arguments shall be an sc_array_t * holding double variables. The number of
+ * doubles in each sc_array must be exactly the number of components (1 for
+ * scalar and 3 for vector) times 8 times number of elements.
+ *
+ * \note The current
+ * p8est_vtk_context_t structure, cont, must be the last argument of any call
+ * to this function; this argument is used to validate that the correct number
+ * of variable arguments have been provided.
+ *
+ * \note The number of point scalar data in each
+ * sc_array must be exactly \a P8EST_CHILDREN*local_num_quadrants, and the
+ * number of point vector data must be exactly \a
+ * 3*P8EST_CHILDREN*local_num_quadrants. I.e. there must be data for every
+ * corner of every quadrant in the \a p8est, even if the corner is shared by
+ * multiple quadrants.
+ *
+ * \return          On success, the context that has been passed in.
+ *                  On failure, returns NULL and deallocates the context.
+ */
+p8est_vtk_context_t *p8est_vtk_write_point_dataf (p8est_vtk_context_t * cont,
+                                                  int num_point_scalars,
+                                                  int num_point_vectors, ...);
 
-/** This will write the footer of the vtu file.
- *
- * Writing a VTK file is split into a couple of routines.
- * The allows there to be an arbitrary number of
- * fields.  To write out two fields the
- * calling sequence would be something like
+/** Write the VTU footer and clean up.
  *
- * \begincode
- * p8est_vtk_write_header(p8est, ..., "output");
- * p8est_vtk_write_footer(p8est, "output");
- * \endcode
+ * Writing a VTK file is split into a few routines.
+ * This function writes the footer information to the VTK file and cleanly
+ * destroys the VTK context.
  *
- * \param p8est     The p8est to be written.
- * \param filename  The first part of the name which will have
- *                  the proc number appended to it (i.e., the
- *                  output file will be filename_procNum.vtu).
+ * \param [in] cont Context is deallocated before the function returns.
  *
  * \return          This returns 0 if no error and -1 if there is an error.
  */
-int                 p8est_vtk_write_footer (p8est_t * p8est,
-                                            const char *filename);
+int                 p8est_vtk_write_footer (p8est_vtk_context_t * cont);
 
 SC_EXTERN_C_END;
 
diff --git a/src/p8est_wrap.c b/src/p8est_wrap.c
index 6cab255..1ba3954 100644
--- a/src/p8est_wrap.c
+++ b/src/p8est_wrap.c
@@ -3,7 +3,9 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 Carsten Burstedde
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/p8est_wrap.h b/src/p8est_wrap.h
index 7555d74..45ea270 100644
--- a/src/p8est_wrap.h
+++ b/src/p8est_wrap.h
@@ -3,7 +3,9 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 Carsten Burstedde
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -23,11 +25,20 @@
 #ifndef P8EST_WRAP_H
 #define P8EST_WRAP_H
 
+/** \file p8est_wrap.h
+ * The logic in p8est_wrap encapsulates core p4est data structures and provides
+ * functions that clarify the mark-adapt-partition cycle.  There is also an
+ * element iterator that can replace the nested loops over trees and tree
+ * quadrants, respectively, which can help make application code cleaner.
+ */
+
 #include <p8est_mesh.h>
+#include <p8est_extended.h>
+#include <sc_refcount.h>
 
 SC_EXTERN_C_BEGIN;
 
-/*** COMPLETE INTERNAL STATE OF P4EST ***/
+/*** COMPLETE INTERNAL STATE OF P8EST ***/
 
 typedef enum p8est_wrap_flags
 {
@@ -42,12 +53,34 @@ typedef struct p8est_wrap
   /* this member is never used or changed by p8est_wrap */
   void               *user_pointer;     /**< Convenience member for users */
 
+  /** If true, this wrap has NULL for ghost, mesh, and flag members.
+   * If false, they are properly allocated and kept current internally. */
+  int                 hollow;
+
+  /** Non-negative integer tells us how many adaptations to wait
+   * before any given quadrant may be coarsened again. */
+  int                 coarsen_delay;
+
+  /** Boolean: If true, we delay coarsening not only after refinement,
+   * but also between subsequent coarsenings of the same quadrant. */
+  int                 coarsen_affect;
+
+  /** This reference counter is a workaround for internal use only.
+   * Until we have refcounting/copy-on-write for the connectivity,
+   * we count the references to conn by copies of this wrap structure.
+   * There must be no external references left when this wrap is destroyed.
+   */
+  sc_refcount_t       conn_rc;
+  p8est_connectivity_t *conn;
+  struct p8est_wrap  *conn_owner;
+
   /* these members are considered public and read-only */
   int                 p4est_dim;
   int                 p4est_half;
   int                 p4est_faces;
   int                 p4est_children;
-  p8est_connectivity_t *conn;
+  p8est_connect_type_t btype;
+  p8est_replace_t     replace_fn;
   p8est_t            *p4est;    /**< p4est->user_pointer is used internally */
 
   /* anything below here is considered private und should not be touched */
@@ -55,7 +88,7 @@ typedef struct p8est_wrap
   uint8_t            *flags, *temp_flags;
   p4est_locidx_t      num_refine_flags, inside_counter, num_replaced;
 
-  /* for ghost and mesh use p4est_wrap_get_ghost, _mesh declared below */
+  /* for ghost and mesh use p8est_wrap_get_ghost, _mesh declared below */
   p8est_ghost_t      *ghost;
   p8est_mesh_t       *mesh;
   p8est_ghost_t      *ghost_aux;
@@ -64,7 +97,9 @@ typedef struct p8est_wrap
 }
 p8est_wrap_t;
 
-/** Create a p4est wrapper from a given connectivity structure.
+/** Create a p8est wrapper from a given connectivity structure.
+ * The ghost and mesh members are initialized as well as the flags.
+ * The btype is set to P8EST_CONNECT_FULL.
  * \param [in] mpicomm        We expect sc_MPI_Init to be called already.
  * \param [in] conn           Connectivity structure.  Wrap takes ownership.
  * \param [in] initial_level  Initial level of uniform refinement.
@@ -74,6 +109,63 @@ p8est_wrap_t       *p8est_wrap_new_conn (sc_MPI_Comm mpicomm,
                                          p8est_connectivity_t * conn,
                                          int initial_level);
 
+/** Create a wrapper for a given p8est structure.
+ * \param [in,out] p8est      Valid p8est object that we will own.
+ *                            We take ownership of its connectivity too.
+ *                            Its user pointer must be NULL and will be changed.
+ * \param [in] hollow         Do not allocate flags, ghost, and mesh members.
+ * \param [in] btype          The neighborhood used for balance, ghost, mesh.
+ * \param [in] replace_fn     Callback to replace quadrants during refinement,
+ *                            coarsening or balancing in \ref p8est_wrap_adapt.
+ *                            May be NULL.
+ * \param [in] user_pointer   Set the user pointer in \ref p8est_wrap_t.
+ *                            Subsequently, we will never access it.
+ * \return                    A fully initialized p8est_wrap structure.
+ */
+p8est_wrap_t       *p8est_wrap_new_p8est (p8est_t * p8est, int hollow,
+                                          p8est_connect_type_t btype,
+                                          p8est_replace_t replace_fn,
+                                          void *user_pointer);
+
+/** Create a p8est wrapper from a given connectivity structure.
+ * Like p8est_wrap_new_conn, but with extra parameters \a hollow and \a btype.
+ * \param [in] mpicomm        We expect sc_MPI_Init to be called already.
+ * \param [in] conn           Connectivity structure.  Wrap takes ownership.
+ * \param [in] initial_level  Initial level of uniform refinement.
+ *                            No effect if less/equal to zero.
+ * \param [in] hollow         Do not allocate flags, ghost, and mesh members.
+ * \param [in] btype          The neighborhood used for balance, ghost, mesh.
+ * \param [in] replace_fn     Callback to replace quadrants during refinement,
+ *                            coarsening or balancing in \ref p8est_wrap_adapt.
+ *                            May be NULL.
+ * \param [in] user_pointer   Set the user pointer in \ref p8est_wrap_t.
+ *                            Subsequently, we will never access it.
+ * \return                    A fully initialized p8est_wrap structure.
+ */
+p8est_wrap_t       *p8est_wrap_new_ext (sc_MPI_Comm mpicomm,
+                                        p8est_connectivity_t * conn,
+                                        int initial_level, int hollow,
+                                        p8est_connect_type_t btype,
+                                        p8est_replace_t replace_fn,
+                                        void *user_pointer);
+
+/** Create a p8est wrapper from an existing one.
+ * \note This wrapper must be destroyed before the original one.
+ * We set it to hollow and copy the original p8est data structure.
+ * \param [in,out] source   We access the source for debugging purposes.
+ * \param [in] data_size    The data size installed in the copied forest.
+ * \param [in] replace_fn     Callback to replace quadrants during refinement,
+ *                            coarsening or balancing in \ref p8est_wrap_adapt.
+ *                            May be NULL.
+ * \param [in] user_pointer   Set the user pointer in \ref p8est_wrap_t.
+ *                            Subsequently, we will never access it.
+ * \return                    A fully initialized p8est_wrap structure.
+ */
+p8est_wrap_t       *p8est_wrap_new_copy (p8est_wrap_t * source,
+                                         size_t data_size,
+                                         p8est_replace_t replace_fn,
+                                         void *user_pointer);
+
 /** Create p8est and auxiliary data structures.
  * Expects sc_MPI_Init to be called beforehand.
  */
@@ -81,26 +173,58 @@ p8est_wrap_t       *p8est_wrap_new_unitcube (sc_MPI_Comm mpicomm,
                                              int initial_level);
 p8est_wrap_t       *p8est_wrap_new_rotwrap (sc_MPI_Comm mpicomm,
                                             int initial_level);
+p8est_wrap_t       *p8est_wrap_new_brick (sc_MPI_Comm mpicomm,
+                                          int bx, int by, int bz,
+                                          int px, int py, int pz,
+                                          int initial_level);
 
 /** Passes sc_MPI_COMM_WORLD to p8est_wrap_new_unitcube. */
 p8est_wrap_t       *p8est_wrap_new_world (int initial_level);
 void                p8est_wrap_destroy (p8est_wrap_t * pp);
 
+/** Change hollow status of the wrap.
+ * It is legal to set to the current hollow status.
+ * \param [in,out] pp   The present wrap structure, hollow or not.
+ * \param [in] hollow   The desired hollow status.
+ */
+void                p8est_wrap_set_hollow (p8est_wrap_t * pp, int hollow);
+
+/** Set a parameter that delays coarsening after adaptation.
+ * If positive each quadrant counts the number of adaptations it has survived.
+ * Calling this function initializes all quadrant counters to zero.
+ * On adaptation we only coarsen a quadrant if it is old enough.
+ * Optionally, we can also delay the time between subsequent coarsenings.
+ * \param [in,out] pp           A valid p8est_wrap structure.
+ * \param [in] coarsen_delay    Set how many adaptation cycles a quadrant has
+ *                              to wait to be allowed to coarsen.
+ *                              Non-negative number; 0 disables the feature.
+ *                              Suggested default value: not larger than 2.
+ * \param [in] coarsen_affect   Boolean; If true, we not only count from the
+ *                              most recent refinement but also between
+ *                              subsequent coarsenings.
+ *                              Suggested default: 0.
+ */
+void                p8est_wrap_set_coarsen_delay (p8est_wrap_t * pp,
+                                                  int coarsen_delay,
+                                                  int coarsen_affect);
+
 /** Return the appropriate ghost layer.
  * This function is necessary since two versions may exist simultaneously
  * after refinement and before partition/complete.
+ * \param [in,out] pp The p8est wrapper to work with, must not be hollow.
  * */
 p8est_ghost_t      *p8est_wrap_get_ghost (p8est_wrap_t * pp);
 
 /** Return the appropriate mesh structure.
  * This function is necessary since two versions may exist simultaneously
  * after refinement and before partition/complete.
+ * \param [in,out] pp The p8est wrapper to work with, must not be hollow.
  * */
 p8est_mesh_t       *p8est_wrap_get_mesh (p8est_wrap_t * pp);
 
 /** Mark a local element for refinement.
  * This will cancel any coarsening mark set previously for this element.
- * \param [in,out] wrap The p8est wrapper to work with.
+ * \param [in,out] pp The p8est wrapper to work with, must not be hollow.
  * \param [in] which_tree The number of the tree this element lives in.
  * \param [in] which_quad The number of this element relative to its tree.
  */
@@ -110,7 +234,7 @@ void                p8est_wrap_mark_refine (p8est_wrap_t * pp,
 
 /** Mark a local element for coarsening.
  * This will cancel any refinement mark set previously for this element.
- * \param [in,out] wrap The p8est wrapper to work with.
+ * \param [in,out] pp The p8est wrapper to work with, must not be hollow.
  * \param [in] which_tree The number of the tree this element lives in.
  * \param [in] which_quad The number of this element relative to its tree.
  */
@@ -122,6 +246,7 @@ void                p8est_wrap_mark_coarsen (p8est_wrap_t * pp,
  * Checks pp->flags as per-quadrant input against p8est_wrap_flags_t.
  * The pp->flags array is updated along with p8est and reset to zeros.
  * Creates ghost_aux and mesh_aux to represent the intermediate mesh.
+ * \param [in,out] pp The p8est wrapper to work with, must not be hollow.
  * \return          boolean whether p8est has changed.
  *                  If true, partition must be called.
  *                  If false, partition must not be called,
@@ -132,22 +257,42 @@ int                 p8est_wrap_adapt (p8est_wrap_t * pp);
 /** Call p8est_partition for equal leaf distribution.
  * Frees the old ghost and mesh first and updates pp->flags along with p8est.
  * Creates ghost and mesh to represent the new mesh.
+ * \param [in,out] pp The p8est wrapper to work with, must not be hollow.
  * \param [in] weight_exponent      Integer weight assigned to each leaf
  *                  according to 2 ** (level * exponent).  Passing 0 assigns
  *                  equal weight to all leaves.  Passing 1 increases the
  *                  leaf weight by a factor of two for each level increase.
  *                  CURRENTLY ONLY 0 AND 1 ARE LEGAL VALUES.
- * \return          boolean whether p4est has changed.
+ * \param [out] unchanged_first
+ *                  If not NULL, is assigned the processor-local index of the
+ *                  first local quadrant that has stayed on this processor.  If
+ *                  no quadrant has stayed, the value is set to zero.
+ *                  This number is in reference to the new (output) partition.
+ * \param [out] unchanged_length
+ *                  If not NULL, is assigned the number of quadrants that have
+ *                  stayed on this processor.  If no quadrant has stayed, the
+ *                  value is set to zero.
+ * \param [out] unchanged_old_first
+ *                  If not NULL, is assigned the processor-local index of the
+ *                  first local quadrant that has stayed with reference to
+ *                  the old (input) partition.  If no quadrant has stayed,
+ *                  the value is set to zero.
+ * \return          boolean whether p8est has changed.
  *                  If true, complete must be called.
  *                  If false, complete must not be called.
  */
 int                 p8est_wrap_partition (p8est_wrap_t * pp,
-                                          int weight_exponent);
+                                          int weight_exponent,
+                                          p4est_locidx_t * unchanged_first,
+                                          p4est_locidx_t * unchanged_length,
+                                          p4est_locidx_t *
+                                          unchanged_old_first);
 
 /** Free memory for the intermediate mesh.
  * Sets mesh_aux and ghost_aux to NULL.
  * This function must be used if both refinement and partition effect changes.
  * After this call, we are ready for another mark-refine-partition cycle.
+ * \param [in,out] pp The p8est wrapper to work with, must not be hollow.
  */
 void                p8est_wrap_complete (p8est_wrap_t * pp);
 
@@ -155,27 +300,50 @@ void                p8est_wrap_complete (p8est_wrap_t * pp);
 
 typedef struct p8est_wrap_leaf
 {
-  p8est_wrap_t       *pp;
+  p8est_wrap_t       *pp;             /**< Must contain a valid ghost */
+
+  /* Information about the current quadrant */
+  p4est_topidx_t      which_tree;     /**< Current tree number */
+  p4est_locidx_t      which_quad;     /**< Quadrant number relative to tree */
+  p4est_locidx_t      local_quad;     /**< Quadrant number relative to proc */
+  p8est_tree_t       *tree;           /**< Current tree */
+  sc_array_t         *tquadrants;     /**< Current tree's quadrants */
+  p8est_quadrant_t   *quad;           /**< Current quadrant */
+#if 0                           /* DEPRECATED -- anyone using them? */
   int                 level;
-  p4est_topidx_t      which_tree;
-  p4est_locidx_t      which_quad;
-  p4est_locidx_t      total_quad;
-  p8est_tree_t       *tree;
-  p8est_quadrant_t   *quad;
   double              lowerleft[3];
   double              upperright[3];
+#endif
+
+  /* Information about parallel neighbors */
+  int                 is_mirror;      /**< Quadrant at parallel boundary? */
+  sc_array_t         *mirrors;        /**< If not NULL, from pp's ghost */
+  p4est_locidx_t      nm;             /**< Internal: mirror counter */
+  p4est_locidx_t      next_mirror_quadrant;     /**< Internal: next */
 }
 p8est_wrap_leaf_t;
 
-/* Create an iterator over the leaves in the forest.
+/** Determine whether we have just entered a different tree */
+#define P8EST_LEAF_IS_FIRST_IN_TREE(wleaf) ((wleaf)->which_quad == 0)
+
+/** Create an iterator over the local leaves in the forest.
  * Returns a newly allocated state containing the first leaf,
  * or NULL if the local partition of the tree is empty.
+ * \param [in] pp   Legal p8est_wrap structure, hollow or not.
+ * \param [in] track_mirrors    If true, \a pp must not be hollow and mirror
+ *                              information from the ghost layer is stored.
+ * \return          NULL if processor is empty, otherwise a leaf iterator for
+ *                  subsequent use with \a p8est_wrap_leaf_next.
  */
-p8est_wrap_leaf_t  *p8est_wrap_leaf_first (p8est_wrap_t * pp);
+p8est_wrap_leaf_t  *p8est_wrap_leaf_first (p8est_wrap_t * pp,
+                                           int track_mirrors);
 
 /* Move the forest leaf iterator forward.
- * Returns the state that was input with information for the next leaf,
- * or NULL and deallocates the input if called with the last leaf.
+ * \param [in,out] leaf     A non-NULL leaf iterator created by
+ *                          \ref p8est_wrap_leaf_first.
+ * \return          The state that was input with updated information for the
+ *                  next leaf, or NULL and deallocates the input if called with
+ *                  the last leaf on this processor.
  */
 p8est_wrap_leaf_t  *p8est_wrap_leaf_next (p8est_wrap_leaf_t * leaf);
 
diff --git a/test/Makefile.am b/test/Makefile.am
index 31f84fe..afcb9ba 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -11,13 +11,17 @@ p4est_test_programs += \
         test/p4est_test_partition test/p4est_test_coarsen \
         test/p4est_test_valid test/p4est_test_balance_type \
         test/p4est_test_loadsave test/p4est_test_order \
-        test/p4est_test_ghost \
+        test/p4est_test_load test/p4est_test_ghost \
+        test/p4est_test_conn_transformation \
         test/p4est_test_iterate test/p4est_test_lnodes \
         test/p4est_test_search test/p4est_test_brick \
+	test/p4est_test_complete_subtree \
         test/p4est_test_partition_corr \
         test/p4est_test_conn_complete test/p4est_test_balance_seeds \
         test/p4est_test_wrap test/p4est_test_replace test/p4est_test_join \
-        test/p4est_test_conn_reduce test/p4est_test_plex
+        test/p4est_test_conn_reduce test/p4est_test_plex \
+        test/p4est_test_connrefine \
+        test/p4est_test_subcomm
 if P4EST_WITH_METIS
 p4est_test_programs += \
         test/p4est_test_reorder
@@ -30,21 +34,37 @@ p4est_test_programs += \
         test/p8est_test_valid test/p8est_test_balance_type \
         test/p8est_test_face_transform test/p8est_test_edge_face_corners \
         test/p8est_test_periodic test/p8est_test_loadsave \
-        test/p8est_test_ghost \
+        test/p8est_test_load test/p8est_test_ghost \
+        test/p8est_test_conn_transformation \
         test/p8est_test_iterate test/p8est_test_lnodes \
         test/p8est_test_search test/p8est_test_brick \
         test/p8est_test_partition_corr \
         test/p8est_test_conn_complete test/p8est_test_balance_seeds \
         test/p8est_test_wrap test/p8est_test_replace test/p8est_test_join \
-        test/p8est_test_conn_reduce test/p8est_test_plex
+        test/p8est_test_conn_reduce test/p8est_test_plex \
+        test/p8est_test_connrefine \
+        test/p8est_test_subcomm
 if P4EST_WITH_METIS
 p4est_test_programs += \
         test/p8est_test_reorder
 endif
 endif
+if P4EST_ENABLE_BUILD_2D
+if P4EST_ENABLE_BUILD_3D
+if P4EST_ENABLE_BUILD_P6EST
+p4est_test_programs += test/p6est_test_all
+endif
+endif
+endif
 
 check_PROGRAMS += $(p4est_test_programs)
 
+## We must wrap this into P4EST_ENABLE_BUILD_2D etc.
+## Otherwise automake generates the targets and it will be legal
+## to call make test/p4est_test_comm even when BUILD_2D is off,
+## which will fail due to missing dependencies from libp4est.so.
+
+if P4EST_ENABLE_BUILD_2D
 test_p4est_test_comm_SOURCES = test/test_comm.c
 test_p4est_test_hash_SOURCES = test/test_hash.c
 test_p4est_test_quadrants_SOURCES = test/test_quadrants2.c
@@ -55,10 +75,13 @@ test_p4est_test_coarsen_SOURCES = test/test_coarsen2.c
 test_p4est_test_valid_SOURCES = test/test_valid2.c
 test_p4est_test_balance_type_SOURCES = test/test_balance_type2.c
 test_p4est_test_loadsave_SOURCES = test/test_loadsave2.c
+test_p4est_test_load_SOURCES = test/test_load2.c
 test_p4est_test_ghost_SOURCES = test/test_ghost2.c
+test_p4est_test_conn_transformation_SOURCES = test/test_conn_transformation2.c
 test_p4est_test_iterate_SOURCES = test/test_iterate2.c
 test_p4est_test_lnodes_SOURCES = test/test_lnodes2.c
 test_p4est_test_search_SOURCES = test/test_search2.c
+test_p4est_test_complete_subtree_SOURCES = test/test_complete_subtree.c
 test_p4est_test_brick_SOURCES = test/test_brick2.c
 test_p4est_test_partition_corr_SOURCES = test/test_partition_corr2.c
 test_p4est_test_conn_complete_SOURCES = test/test_conn_complete2.c
@@ -68,7 +91,17 @@ test_p4est_test_replace_SOURCES = test/test_replace2.c
 test_p4est_test_join_SOURCES = test/test_join2.c
 test_p4est_test_conn_reduce_SOURCES = test/test_conn_reduce2.c
 test_p4est_test_plex_SOURCES = test/test_plex2.c
+test_p4est_test_connrefine_SOURCES = test/test_connrefine2.c
+test_p4est_test_subcomm_SOURCES = test/test_subcomm2.c
+if P4EST_WITH_METIS
+test_p4est_test_reorder_SOURCES = test/test_reorder2.c
+endif
+
+test_p4est_test_plex_CPPFLAGS = @P4EST_PETSC_INCLUDE_DIRS@ $(AM_CPPFLAGS)
+test_p4est_test_plex_LDADD = @P4EST_PETSC_LINK_LIBS@ $(LDADD)
+endif
 
+if P4EST_ENABLE_BUILD_3D
 test_p8est_test_quadrants_SOURCES = test/test_quadrants3.c
 test_p8est_test_balance_SOURCES = test/test_balance3.c
 test_p8est_test_partition_SOURCES = test/test_partition3.c
@@ -79,7 +112,9 @@ test_p8est_test_face_transform_SOURCES = test/test_face_transform3.c
 test_p8est_test_edge_face_corners_SOURCES = test/test_edge_face_corners3.c
 test_p8est_test_periodic_SOURCES = test/test_periodic3.c
 test_p8est_test_loadsave_SOURCES = test/test_loadsave3.c
+test_p8est_test_load_SOURCES = test/test_load3.c
 test_p8est_test_ghost_SOURCES = test/test_ghost3.c
+test_p8est_test_conn_transformation_SOURCES = test/test_conn_transformation3.c
 test_p8est_test_brick_SOURCES = test/test_brick3.c
 test_p8est_test_iterate_SOURCES = test/test_iterate3.c
 test_p8est_test_lnodes_SOURCES = test/test_lnodes3.c
@@ -92,17 +127,24 @@ test_p8est_test_replace_SOURCES = test/test_replace3.c
 test_p8est_test_join_SOURCES = test/test_join3.c
 test_p8est_test_conn_reduce_SOURCES = test/test_conn_reduce3.c
 test_p8est_test_plex_SOURCES = test/test_plex3.c
-
-test_p4est_test_plex_CPPFLAGS = @P4EST_PETSC_INCLUDE_DIRS@ $(AM_CPPFLAGS) 
-test_p8est_test_plex_CPPFLAGS = @P4EST_PETSC_INCLUDE_DIRS@ $(AM_CPPFLAGS) 
-test_p4est_test_plex_LDADD = @P4EST_PETSC_LINK_LIBS@ $(LDADD) 
-test_p8est_test_plex_LDADD = @P4EST_PETSC_LINK_LIBS@ $(LDADD) 
-
+test_p8est_test_connrefine_SOURCES = test/test_connrefine3.c
+test_p8est_test_subcomm_SOURCES = test/test_subcomm3.c
 if P4EST_WITH_METIS
-test_p4est_test_reorder_SOURCES = test/test_reorder2.c
 test_p8est_test_reorder_SOURCES = test/test_reorder3.c
 endif
 
+test_p8est_test_plex_CPPFLAGS = @P4EST_PETSC_INCLUDE_DIRS@ $(AM_CPPFLAGS)
+test_p8est_test_plex_LDADD = @P4EST_PETSC_LINK_LIBS@ $(LDADD)
+endif
+
+if P4EST_ENABLE_BUILD_2D
+if P4EST_ENABLE_BUILD_3D
+if P4EST_ENABLE_BUILD_P6EST
+test_p6est_test_all_SOURCES = test/test_all6.c
+endif
+endif
+endif
+
 TESTS += $(p4est_test_programs)
 
 LINT_CSOURCES += \
@@ -116,10 +158,13 @@ LINT_CSOURCES += \
         $(test_p4est_test_valid_SOURCES) \
         $(test_p4est_test_balance_type_SOURCES) \
         $(test_p4est_test_loadsave_SOURCES) \
+        $(test_p4est_test_load_SOURCES) \
         $(test_p4est_test_ghost_SOURCES) \
+        $(test_p4est_test_conn_transformation_SOURCES) \
         $(test_p4est_test_iterate_SOURCES) \
         $(test_p4est_test_lnodes_SOURCES) \
         $(test_p4est_test_search_SOURCES) \
+        $(test_p4est_test_complete_subtree_SOURCES) \
         $(test_p4est_test_brick_SOURCES) \
         $(test_p4est_test_partition_corr_SOURCES) \
         $(test_p4est_test_reorder_SOURCES) \
@@ -129,6 +174,8 @@ LINT_CSOURCES += \
         $(test_p4est_test_join_SOURCES) \
         $(test_p4est_test_conn_reduce_SOURCES) \
         $(test_p4est_test_plex_SOURCES) \
+        $(test_p4est_test_connrefine_SOURCES) \
+        $(test_p4est_test_subcomm_SOURCES) \
         $(test_p8est_test_quadrants_SOURCES) \
         $(test_p8est_test_balance_SOURCES) \
         $(test_p8est_test_partition_SOURCES) \
@@ -139,7 +186,9 @@ LINT_CSOURCES += \
         $(test_p8est_test_edge_face_corners_SOURCES) \
         $(test_p8est_test_periodic_SOURCES) \
         $(test_p8est_test_loadsave_SOURCES) \
+        $(test_p8est_test_load_SOURCES) \
         $(test_p8est_test_ghost_SOURCES) \
+        $(test_p8est_test_conn_transformation_SOURCES) \
         $(test_p8est_test_brick_SOURCES) \
         $(test_p8est_test_iterate_SOURCES) \
         $(test_p8est_test_lnodes_SOURCES) \
@@ -151,7 +200,10 @@ LINT_CSOURCES += \
         $(test_p8est_test_replace_SOURCES) \
         $(test_p8est_test_join_SOURCES) \
         $(test_p8est_test_conn_reduce_SOURCES) \
-        $(test_p8est_test_plex_SOURCES)
+        $(test_p8est_test_plex_SOURCES) \
+        $(test_p8est_test_connrefine_SOURCES) \
+        $(test_p8est_test_subcomm_SOURCES) \
+        $(test_p6est_test_all_SOURCES)
 
 if P4EST_WITH_METIS
 LINT_CSOURCES += \
diff --git a/example/p6est/test/test_all.c b/test/test_all6.c
similarity index 97%
rename from example/p6est/test/test_all.c
rename to test/test_all6.c
index f002c73..fbe20e8 100644
--- a/example/p6est/test/test_all.c
+++ b/test/test_all6.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2014 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -170,7 +171,7 @@ main (int argc, char **argv)
   double              height[3] = { 0., 0., 0.1 };
   int                 i;
   int                 vtk;
-  unsigned            crc_computed;
+  unsigned            crc_computed = 0;
   sc_options_t       *opt;
   int                 first_argc;
   const char         *config_name;
@@ -232,8 +233,9 @@ main (int argc, char **argv)
   p6est_destroy (p6est);
 
   sc_flops_snap (&fi, &snapshot);
-  p6est = p6est_new_ext (mpicomm, conn, 0, refine_level, refine_zlevel, 1, 3,
-                         init_fn, TEST_USER_POINTER);
+  p6est =
+    p6est_new_ext (mpicomm, conn, 0, refine_level, refine_zlevel, 3, 1, 3,
+                   init_fn, TEST_USER_POINTER);
   sc_flops_shot (&fi, &snapshot);
   sc_stats_set1 (&stats[TIMINGS_NEW_EXT], snapshot.iwtime, "New extended");
 
@@ -375,9 +377,11 @@ main (int argc, char **argv)
     p6est_lnodes_destroy (lnodes);
   }
 
+#ifdef P4EST_HAVE_ZLIB
   crc_computed = p6est_checksum (p6est);
 
   P4EST_GLOBAL_PRODUCTIONF ("p6est checksum 0x%08x\n", crc_computed);
+#endif
 
   if (save_filename) {
     sc_flops_snap (&fi, &snapshot);
diff --git a/test/test_balance2.c b/test/test_balance2.c
index 13c3a18..cc9f7aa 100644
--- a/test/test_balance2.c
+++ b/test/test_balance2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_balance3.c b/test/test_balance3.c
index 0f71eac..9a4068f 100644
--- a/test/test_balance3.c
+++ b/test/test_balance3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_balance_seeds2.c b/test/test_balance_seeds2.c
index 4bede93..e746f40 100644
--- a/test/test_balance_seeds2.c
+++ b/test/test_balance_seeds2.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2011 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_balance_seeds3.c b/test/test_balance_seeds3.c
index 6d73bfc..7d2588d 100644
--- a/test/test_balance_seeds3.c
+++ b/test/test_balance_seeds3.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2011 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -23,5 +24,3 @@
 
 #include <p4est_to_p8est.h>
 #include "test_balance_seeds2.c"
-
-/* EOF test_balance_seeds3.c */
diff --git a/test/test_balance_type2.c b/test/test_balance_type2.c
index f261196..3caf7c1 100644
--- a/test/test_balance_type2.c
+++ b/test/test_balance_type2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_balance_type3.c b/test/test_balance_type3.c
index 56a4bd7..fa75999 100644
--- a/test/test_balance_type3.c
+++ b/test/test_balance_type3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_brick2.c b/test/test_brick2.c
index 3b0fdc8..277e0c5 100644
--- a/test/test_brick2.c
+++ b/test/test_brick2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_brick3.c b/test/test_brick3.c
index 8cfb042..233d1dc 100644
--- a/test/test_brick3.c
+++ b/test/test_brick3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_coarsen2.c b/test/test_coarsen2.c
index ab4b5f7..313587b 100644
--- a/test/test_coarsen2.c
+++ b/test/test_coarsen2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_coarsen3.c b/test/test_coarsen3.c
index 98fef9f..512ab47 100644
--- a/test/test_coarsen3.c
+++ b/test/test_coarsen3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_comm.c b/test/test_comm.c
index bcbf2b0..ba07f55 100644
--- a/test/test_comm.c
+++ b/test/test_comm.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_complete_subtree.c b/test/test_complete_subtree.c
new file mode 100644
index 0000000..98ae96d
--- /dev/null
+++ b/test/test_complete_subtree.c
@@ -0,0 +1,165 @@
+/*
+  This file is part of p4est.
+  p4est is a C library to manage a collection (a forest) of multiple
+  connected adaptive quadtrees or octrees in parallel.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
+
+  p4est is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  p4est is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with p4est; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+#ifndef P4_TO_P8
+#include <p4est_algorithms.h>
+#include <p4est_bits.h>
+#include <p4est_communication.h>
+#include <p4est_extended.h>
+#else
+#include <p8est_algorithms.h>
+#include <p8est_bits.h>
+#include <p8est_communication.h>
+#include <p8est_extended.h>
+#endif
+
+typedef struct
+{
+  int                 maxlevel;         /* no refinement from this level on */
+  int                 counter;          /* cyclic counter, advanced modulo wrapper */
+  int                 wrapper;          /* modulus; refine when counter hits 0 */
+}
+test_build_t;
+
+static int
+test_build_refine (p4est_t * p4est, p4est_topidx_t which_tree,
+                   p4est_quadrant_t * quadrant)
+{
+  test_build_t       *state = (test_build_t *) p4est->user_pointer;
+
+  /* refine below maxlevel whenever the cyclic counter wraps to zero */
+  if (quadrant->level >= state->maxlevel) {
+    return 0;
+  }
+  state->counter = (state->counter + 1) % state->wrapper;
+  return state->counter == 0;
+}
+
+/* local_id[rank][tree]: local quadrant index seeding that tree, -1 to skip */
+static const p4est_locidx_t local_id[2][5] = {
+  {2, 48, 94, -1, -1},
+  {-1, -1, 2, 25, 71}
+};
+
+/* Seed some trees with a single quadrant and complete each of them again */
+static void
+test_build_local (sc_MPI_Comm mpicomm)
+{
+  p4est_topidx_t      treeid;
+  p4est_locidx_t      lid;
+  p4est_locidx_t      correct;
+  p4est_connectivity_t *conn;
+  p4est_t            *p4est, *built;
+  p4est_tree_t       *ptree, *subtree;
+  p4est_quadrant_t   *quadrant;
+  test_build_t        stb, *tb = &stb;
+
+  /* 0. create, refine and partition the forest that provides the quadrants */
+  tb->maxlevel = 7 - P4EST_DIM;
+  tb->counter = -1;
+  tb->wrapper = 3;
+#ifndef P4_TO_P8
+  conn = p4est_connectivity_new_moebius ();
+#else
+  SC_ABORT_NOT_REACHED ();
+  conn = p8est_connectivity_new_rotcubes ();
+#endif /* P4_TO_P8 */
+  p4est = p4est_new_ext (mpicomm, conn, 0, 0, 0, 0, NULL, tb);
+  p4est_refine (p4est, 1, test_build_refine, NULL);
+  p4est_partition (p4est, 0, NULL);
+
+  /* Create a minimal p4est to enable the use of complete_subtree */
+  built = p4est_copy (p4est, 0);
+  correct = 0;                  /* net change of the local quadrant count */
+  for (treeid = 0; treeid < conn->num_trees; ++treeid) {
+    /* shift this tree's offset by what the previous trees gained or lost */
+    subtree = p4est_tree_array_index (built->trees, treeid);
+    subtree->quadrants_offset += correct;
+
+    /* the table only covers at most two ranks and five trees */
+    lid = -1;
+    if (p4est->mpisize <= 2 && conn->num_trees <= 5) {
+      lid = local_id[p4est->mpirank][treeid];
+    }
+    if (lid >= 0) {
+      ptree = p4est_tree_array_index (p4est->trees, treeid);
+      correct -= (p4est_locidx_t) ptree->quadrants.elem_count;
+
+      /* we leak the quadrant's user data but there is none allocated */
+      sc_array_resize (&subtree->quadrants, 0);
+      memset (subtree->quadrants_per_level, 0,
+              sizeof (p4est_locidx_t) * P4EST_MAXLEVEL);
+      subtree->quadrants_per_level[P4EST_MAXLEVEL] = -1;
+      subtree->maxlevel = 0;
+
+      /* use lid to construct the prescribed quadrant */
+      quadrant = p4est_quadrant_array_index
+        (&ptree->quadrants, lid - ptree->quadrants_offset);
+      P4EST_ASSERT (p4est_quadrant_is_valid (quadrant));
+
+      /* add to emptied subtree */
+      *(p4est_quadrant_t *) sc_array_push (&subtree->quadrants) = *quadrant;
+      subtree->quadrants_per_level[quadrant->level] = 1;
+      subtree->maxlevel = quadrant->level;
+
+      /* and call the offending subtree routine */
+      p4est_complete_subtree (built, treeid, NULL);
+      correct += (p4est_locidx_t) subtree->quadrants.elem_count;
+    }
+  }
+  built->local_num_quadrants += correct;
+  p4est_comm_count_quadrants (built);
+  SC_CHECK_ABORT (p4est_is_valid (built), "Forest invalid after completion");
+
+  /* clean up */
+  p4est_destroy (built);
+  p4est_destroy (p4est);
+  p4est_connectivity_destroy (conn);
+}
+
+int
+main (int argc, char **argv)
+{
+  int                 mpiret;
+  sc_MPI_Comm         world;
+
+  /* start MPI and pick the world communicator */
+  mpiret = sc_MPI_Init (&argc, &argv);
+  SC_CHECK_MPI (mpiret);
+  world = sc_MPI_COMM_WORLD;
+
+  /* register libsc and p4est */
+  sc_init (world, 1, 1, NULL, SC_LP_DEFAULT);
+  p4est_init (NULL, SC_LP_DEFAULT);
+
+  /* run the complete_subtree test case */
+  test_build_local (world);
+
+  /* shut down libsc, then MPI */
+  sc_finalize ();
+  mpiret = sc_MPI_Finalize ();
+  SC_CHECK_MPI (mpiret);
+
+  return 0;
+}
diff --git a/test/test_conn_complete2.c b/test/test_conn_complete2.c
index 0a961be..fd25da1 100644
--- a/test/test_conn_complete2.c
+++ b/test/test_conn_complete2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_conn_complete3.c b/test/test_conn_complete3.c
index 46ca1eb..7c61a06 100644
--- a/test/test_conn_complete3.c
+++ b/test/test_conn_complete3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_conn_reduce2.c b/test/test_conn_reduce2.c
index 0ecb3c7..3d866ce 100644
--- a/test/test_conn_reduce2.c
+++ b/test/test_conn_reduce2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_conn_reduce3.c b/test/test_conn_reduce3.c
index 7d2bff4..c038fa8 100644
--- a/test/test_conn_reduce3.c
+++ b/test/test_conn_reduce3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_conn_transformation2.c b/test/test_conn_transformation2.c
new file mode 100644
index 0000000..0ec11d3
--- /dev/null
+++ b/test/test_conn_transformation2.c
@@ -0,0 +1,218 @@
+/*
+  This file is part of p4est.
+  p4est is a C library to manage a collection (a forest) of multiple
+  connected adaptive quadtrees or octrees in parallel.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
+
+  p4est is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  p4est is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with p4est; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+#ifndef P4_TO_P8
+#include <p4est_connectivity.h>
+#else /* !P4_TO_P8 */
+#include <p8est_connectivity.h>
+#endif /* !P4_TO_P8 */
+
+/** Checks that orientation is properly set, i.e. face corner 0 of the
+ * face with the lower face index is touching face corner \a
+ * orientation of the face with the higher face index.
+ * \param [in] conn        connectivity of the two trees (currently unused)
+ * \param [in] l_face      left face index
+ * \param [in] r_face      right face index
+ * \param [in] orientation the orientation that has been set
+ * \return                 always 0; a failed check aborts the program
+ */
+static int
+test_conn_transformation_check_orientation (p4est_connectivity_t * conn,
+                                            int l_face, int r_face,
+                                            int orientation)
+{
+  int                 neighboring_face_corner, corner_index;
+  int                 lowerFaceIndex, higherFaceIndex;
+
+  /* order the two face indices */
+  lowerFaceIndex = SC_MIN (l_face, r_face);
+  higherFaceIndex = SC_MAX (l_face, r_face);
+
+  /* transform corner 0 of the lower face across to the neighbor tree */
+  corner_index = p4est_face_corners[lowerFaceIndex][0];
+  neighboring_face_corner =
+    p4est_connectivity_face_neighbor_corner_orientation (corner_index,
+                                                         lowerFaceIndex,
+                                                         higherFaceIndex,
+                                                         orientation);
+
+  /* convert the tree corner into its number relative to the higher face */
+  neighboring_face_corner =
+    p4est_corner_face_corners[neighboring_face_corner][higherFaceIndex];
+  SC_CHECK_ABORT (neighboring_face_corner == orientation,
+                  "Orientation of two-tree connectivity not reproduced");
+
+  return 0;
+}
+
+/** Checks for each face corner if the corner indices match on both
+ * sides.
+ * Let face corner fci, corresponding to corner index ci, be
+ * adjacent to face corner fcj, corresponding to corner index cj. We
+ * test if fci is seen from fcj and vice versa.
+ * The check is active regardless of the debug configuration and aborts
+ * the program on failure.
+ * \param [in] conn        connectivity of the two trees (currently unused)
+ * \param [in] l_face      left face index
+ * \param [in] r_face      right face index
+ * \param [in] orientation the orientation that has been set
+ * \return                 always 0; a failed check aborts the program
+ */
+static int
+test_conn_transformation_check_face_corners (p4est_connectivity_t * conn,
+                                             int l_face, int r_face,
+                                             int orientation)
+{
+  int                 c0, c1, cx;
+  int                 i;
+  int                 lowerFaceIndex, higherFaceIndex;
+
+  /* order the two face indices */
+  lowerFaceIndex = SC_MIN (l_face, r_face);
+  higherFaceIndex = SC_MAX (l_face, r_face);
+
+  /* verify bijectivity of transformation */
+  for (i = 0; i < P4EST_HALF; ++i) {
+    /* transform the corner to the neighbor tree and back again */
+    c0 = p4est_face_corners[lowerFaceIndex][i];
+    c1 =
+      p4est_connectivity_face_neighbor_corner_orientation (c0, lowerFaceIndex,
+                                                           higherFaceIndex,
+                                                           orientation);
+    cx =
+      p4est_connectivity_face_neighbor_corner_orientation (c1,
+                                                           higherFaceIndex,
+                                                           lowerFaceIndex,
+                                                           orientation);
+
+    /* the round trip must end at the corner where it started */
+    SC_CHECK_ABORT (c0 == cx,
+                    "Face corner transformation is not bijective");
+  }
+
+  return 0;
+}
+
+#ifdef P4_TO_P8
+/** Checks for each face edge if the edge indices match on both
+ * sides.
+ * Let face edge fei, corresponding to edge index ei, be
+ * adjacent to face edge fej, corresponding to edge index ej. We
+ * test if fei is seen from fej and vice versa.
+ * The check is active regardless of the debug configuration and aborts
+ * the program on failure.
+ * \param [in] conn        connectivity of the two trees (currently unused)
+ * \param [in] l_face      left face index
+ * \param [in] r_face      right face index
+ * \param [in] orientation the orientation that has been set
+ * \return                 always 0; a failed check aborts the program
+ */
+static int
+test_conn_transformation_check_face_edges (p4est_connectivity_t * conn,
+                                           int l_face, int r_face,
+                                           int orientation)
+{
+  int                 e0, e1, ex;
+  int                 i;
+  int                 lowerFaceIndex, higherFaceIndex;
+
+  /* order the two face indices */
+  lowerFaceIndex = SC_MIN (l_face, r_face);
+  higherFaceIndex = SC_MAX (l_face, r_face);
+
+  /* verify bijectivity of transformation */
+  for (i = 0; i < P4EST_HALF; ++i) {
+    /* transform the edge to the neighbor tree and back again */
+    e0 = p8est_face_edges[lowerFaceIndex][i];
+    e1 =
+      p8est_connectivity_face_neighbor_edge_orientation (e0, lowerFaceIndex,
+                                                         higherFaceIndex,
+                                                         orientation);
+    ex =
+      p8est_connectivity_face_neighbor_edge_orientation (e1, higherFaceIndex,
+                                                         lowerFaceIndex,
+                                                         orientation);
+
+    /* the round trip must end at the edge where it started */
+    SC_CHECK_ABORT (e0 == ex,
+                    "Face edge transformation is not bijective");
+  }
+
+  return 0;
+}
+#endif /* P4_TO_P8 */
+
+/* Build a two-tree connectivity for every combination of faces and
+ * orientation and run all transformation checks on each of them. */
+int
+main (int argc, char **argv)
+{
+  sc_MPI_Comm         mpicomm;
+  int                 mpiret;
+  int                 mpisize, mpirank;
+  int                 i, j, k;
+  p4est_connectivity_t *conn;
+
+  /* initialize MPI */
+  mpiret = sc_MPI_Init (&argc, &argv);
+  SC_CHECK_MPI (mpiret);
+  mpicomm = sc_MPI_COMM_WORLD;
+  mpiret = sc_MPI_Comm_size (mpicomm, &mpisize);
+  SC_CHECK_MPI (mpiret);
+  mpiret = sc_MPI_Comm_rank (mpicomm, &mpirank);
+  SC_CHECK_MPI (mpiret);
+
+  /* initialize libsc and p4est */
+  sc_init (mpicomm, 1, 1, NULL, SC_LP_DEFAULT);
+  p4est_init (NULL, SC_LP_DEFAULT);
+
+  conn = NULL;
+
+  for (i = 0; i < P4EST_FACES; ++i) {   /* set l_face */
+    for (j = 0; j < P4EST_FACES; ++j) { /* set r_face */
+      for (k = 0; k < P4EST_HALF; ++k) {        /* set orientation */
+        P4EST_ASSERT (conn == NULL);
+
+        /* create connectivity structure */
+        conn = p4est_connectivity_new_twotrees (i, j, k);
+
+        test_conn_transformation_check_orientation (conn, i, j, k);
+        test_conn_transformation_check_face_corners (conn, i, j, k);
+#ifdef P4_TO_P8
+        test_conn_transformation_check_face_edges (conn, i, j, k);
+#endif /* P4_TO_P8 */
+
+        p4est_connectivity_destroy (conn);
+        conn = NULL;
+      }
+    }
+  }
+
+  /* exit */
+  sc_finalize ();
+  mpiret = sc_MPI_Finalize ();
+  SC_CHECK_MPI (mpiret);
+
+  return 0;
+}
diff --git a/example/timings/bricks3.c b/test/test_conn_transformation3.c
similarity index 91%
copy from example/timings/bricks3.c
copy to test/test_conn_transformation3.c
index 9e3e1e5..f5811f7 100644
--- a/example/timings/bricks3.c
+++ b/test/test_conn_transformation3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -22,4 +23,4 @@
 */
 
 #include <p4est_to_p8est.h>
-#include "bricks2.c"
+#include "test_conn_transformation2.c"
diff --git a/test/test_join2.c b/test/test_connrefine2.c
similarity index 68%
copy from test/test_join2.c
copy to test/test_connrefine2.c
index 55d5fcb..59759aa 100644
--- a/test/test_join2.c
+++ b/test/test_connrefine2.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2013 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -23,15 +24,18 @@
 
 #ifndef P4_TO_P8
 #include <p4est_connectivity.h>
+#include <p4est_vtk.h>
 #else
 #include <p8est_connectivity.h>
+#include <p8est_vtk.h>
 #endif
 
 int
 main (int argc, char **argv)
 {
   int                 mpiret;
-  p4est_connectivity_t *conn1, *conn2;
+  p4est_connectivity_t *conn_in, *conn_out;
+  p4est_t            *p4est;
 
   mpiret = sc_MPI_Init (&argc, &argv);
   SC_CHECK_MPI (mpiret);
@@ -40,24 +44,21 @@ main (int argc, char **argv)
   p4est_init (NULL, SC_LP_DEFAULT);
 
 #ifndef P4_TO_P8
-  conn1 = p4est_connectivity_new_unitsquare ();
-  conn2 = p4est_connectivity_new_rotwrap ();
+  conn_in = p4est_connectivity_new_cubed ();
 #else
-  conn1 = p8est_connectivity_new_unitcube ();
-  conn2 = p8est_connectivity_new_rotwrap ();
+  conn_in = p8est_connectivity_new_rotcubes ();
 #endif
+  conn_out = p4est_connectivity_refine (conn_in, 5);
+  p4est_connectivity_destroy (conn_in);
 
-  p4est_connectivity_join_faces (conn1, 0, 0, 0, 1, 0);
-  p4est_connectivity_join_faces (conn1, 0, 0, P4EST_FACES - 2,
-                                 P4EST_FACES - 1, 1);
+  p4est = p4est_new (sc_MPI_COMM_WORLD, conn_out, 0, NULL, NULL);
+  p4est_vtk_write_file (p4est, NULL, P4EST_STRING "_test_connrefine");
 
-  SC_CHECK_ABORT (p4est_connectivity_is_equivalent (conn1, conn2),
-                  "rotwrap not reproduced");
-
-  p4est_connectivity_destroy (conn1);
-  p4est_connectivity_destroy (conn2);
+  p4est_destroy (p4est);
+  p4est_connectivity_destroy (conn_out);
 
   sc_finalize ();
+
   mpiret = sc_MPI_Finalize ();
   SC_CHECK_MPI (mpiret);
   return 0;
diff --git a/example/timings/bricks3.c b/test/test_connrefine3.c
similarity index 92%
copy from example/timings/bricks3.c
copy to test/test_connrefine3.c
index 9e3e1e5..abd27cc 100644
--- a/example/timings/bricks3.c
+++ b/test/test_connrefine3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -22,4 +23,4 @@
 */
 
 #include <p4est_to_p8est.h>
-#include "bricks2.c"
+#include "test_connrefine2.c"
diff --git a/test/test_edge_face_corners3.c b/test/test_edge_face_corners3.c
index 721b234..72de6f4 100644
--- a/test/test_edge_face_corners3.c
+++ b/test/test_edge_face_corners3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_face_transform3.c b/test/test_face_transform3.c
index 3ceb228..82ce9e3 100644
--- a/test/test_face_transform3.c
+++ b/test/test_face_transform3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_ghost2.c b/test/test_ghost2.c
index 14060af..226f7e4 100644
--- a/test/test_ghost2.c
+++ b/test/test_ghost2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -87,6 +88,70 @@ typedef struct test_exchange
 }
 test_exchange_t;
 
+static p4est_ghost_exchange_t *
+test_exchange_begin (p4est_t * p4est, p4est_ghost_t * ghost)
+{
+  size_t              qi;
+  p4est_topidx_t      tt;
+  p4est_gloidx_t      gnum;
+  p4est_tree_t       *tree;
+  p4est_quadrant_t   *quad;
+  void              **recv_data;
+
+  /* with a data size of 0 the exchange transfers the user_data pointer
+   * itself, so tag every local quadrant with a number derived from gnum */
+  p4est_reset_data (p4est, 0, NULL, NULL);
+  gnum = p4est->global_first_quadrant[p4est->mpirank];
+  for (tt = p4est->first_local_tree; tt <= p4est->last_local_tree; ++tt) {
+    tree = p4est_tree_array_index (p4est->trees, tt);
+    for (qi = 0; qi < tree->quadrants.elem_count; ++qi) {
+      quad = p4est_quadrant_array_index (&tree->quadrants, qi);
+      quad->p.user_long = (long) (3 * gnum + 17);
+      ++gnum;
+    }
+  }
+  P4EST_ASSERT (gnum == p4est->global_first_quadrant[p4est->mpirank + 1]);
+
+  /* receive buffer; it travels inside the returned exchange context */
+  recv_data = P4EST_ALLOC (void *, ghost->ghosts.elem_count);
+
+  return p4est_ghost_exchange_data_begin (p4est, ghost, recv_data);
+}
+
+static void
+test_exchange_end (p4est_ghost_exchange_t * exc)
+{
+  void              **received = (void **) exc->ghost_data;
+  p4est_t            *p4est = exc->p4est;
+  p4est_ghost_t      *ghost = exc->ghost;
+  int                 rank;
+  p4est_locidx_t      gexcl, gend, gi;
+  p4est_gloidx_t      offset, expected;
+  p4est_quadrant_t   *gq;
+
+  /* completes the exchange and frees exc; its members were saved above */
+  p4est_ghost_exchange_data_end (exc);
+
+  /* each ghost must carry the number its owner assigned in begin */
+  gexcl = 0;
+  for (rank = 0; rank < p4est->mpisize; ++rank) {
+    gend = ghost->proc_offsets[rank + 1];
+    offset = p4est->global_first_quadrant[rank];
+#ifdef P4EST_TEST_CHATTY
+    P4EST_LDEBUGF ("In test begin/end for %d with %d %d\n", rank, gexcl, gend);
+#endif
+    for (gi = gexcl; gi < gend; ++gi) {
+      gq = p4est_quadrant_array_index (&ghost->ghosts, gi);
+      expected = 3 * (offset + (p4est_gloidx_t) gq->p.piggy3.local_num) + 17;
+      SC_CHECK_ABORT (expected == (p4est_gloidx_t) received[gi],
+                      "Ghost exchange mismatch begin/end");
+    }
+    gexcl = gend;
+  }
+  P4EST_ASSERT (gexcl == (p4est_locidx_t) ghost->ghosts.elem_count);
+  P4EST_FREE (received);
+}
+
 static void
 test_exchange_A (p4est_t * p4est, p4est_ghost_t * ghost)
 {
@@ -317,6 +382,7 @@ main (int argc, char **argv)
   p4est_t            *p4est;
   p4est_connectivity_t *conn;
   p4est_ghost_t      *ghost;
+  p4est_ghost_exchange_t *exc;
   int                 num_cycles = 2;
   int                 i;
   p4est_lnodes_t     *lnodes;
@@ -383,10 +449,12 @@ main (int argc, char **argv)
     /* expand and test that the ghost layer can still exchange data properly
      * */
     p4est_ghost_expand_by_lnodes (p4est, lnodes, ghost);
+    exc = test_exchange_begin (p4est, ghost);
     test_exchange_A (p4est, ghost);
     test_exchange_B (p4est, ghost);
     test_exchange_C (p4est, ghost);
     test_exchange_D (p4est, ghost);
+    test_exchange_end (exc);
   }
 
   /* clean up */
diff --git a/test/test_ghost3.c b/test/test_ghost3.c
index cc95561..48c4f43 100644
--- a/test/test_ghost3.c
+++ b/test/test_ghost3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_hash.c b/test/test_hash.c
index f76b784..804be0f 100644
--- a/test/test_hash.c
+++ b/test/test_hash.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_iterate2.c b/test/test_iterate2.c
index 5adc130..03c58ed 100644
--- a/test/test_iterate2.c
+++ b/test/test_iterate2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -1047,5 +1048,3 @@ main (int argc, char **argv)
 
   return 0;
 }
-
-/* EOF test_iterate2.c */
diff --git a/test/test_iterate3.c b/test/test_iterate3.c
index f7c4b39..f30e3cc 100644
--- a/test/test_iterate3.c
+++ b/test/test_iterate3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -23,5 +24,3 @@
 
 #include <p4est_to_p8est.h>
 #include "test_iterate2.c"
-
-/* EOF test_iterate3.c */
diff --git a/test/test_join2.c b/test/test_join2.c
index 55d5fcb..f64f896 100644
--- a/test/test_join2.c
+++ b/test/test_join2.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2013 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -57,6 +58,17 @@ main (int argc, char **argv)
   p4est_connectivity_destroy (conn1);
   p4est_connectivity_destroy (conn2);
 
+#ifndef P4_TO_P8
+  conn1 = p4est_connectivity_new_unitsquare ();
+#else
+  conn1 = p8est_connectivity_new_unitcube ();
+#endif
+
+  p4est_connectivity_join_faces (conn1, 0, 0, 0, P4EST_FACES - 2, 0);
+  SC_CHECK_ABORT (p4est_connectivity_is_valid (conn1),
+                  "Created rotation around edge, but not valid");
+  p4est_connectivity_destroy (conn1);
+
   sc_finalize ();
   mpiret = sc_MPI_Finalize ();
   SC_CHECK_MPI (mpiret);
diff --git a/test/test_join3.c b/test/test_join3.c
index 13824ed..c3689ef 100644
--- a/test/test_join3.c
+++ b/test/test_join3.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2013 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_lnodes2.c b/test/test_lnodes2.c
index 94060ce..7c9fe0b 100644
--- a/test/test_lnodes2.c
+++ b/test/test_lnodes2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -980,5 +981,3 @@ main (int argc, char **argv)
 
   return 0;
 }
-
-/* EOF test_lnodes2.c */
diff --git a/test/test_lnodes3.c b/test/test_lnodes3.c
index 7643b8e..4d4a6c2 100644
--- a/test/test_lnodes3.c
+++ b/test/test_lnodes3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -23,5 +24,3 @@
 
 #include <p4est_to_p8est.h>
 #include "test_lnodes2.c"
-
-/* EOF test_lnodes3.c */
diff --git a/test/test_load2.c b/test/test_load2.c
new file mode 100644
index 0000000..9e685ed
--- /dev/null
+++ b/test/test_load2.c
@@ -0,0 +1,109 @@
+/*
+  This file is part of p4est.
+  p4est is a C library to manage a collection (a forest) of multiple
+  connected adaptive quadtrees or octrees in parallel.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
+
+  p4est is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  p4est is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with p4est; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+#ifndef P4_TO_P8
+#include <p4est_algorithms.h>
+#include <p4est_bits.h>
+#include <p4est_communication.h>
+#include <p4est_extended.h>
+#include <p4est_io.h>
+#else
+#include <p8est_algorithms.h>
+#include <p8est_bits.h>
+#include <p8est_communication.h>
+#include <p8est_extended.h>
+#include <p8est_io.h>
+#endif
+#include <sc_options.h>
+#include <sc_statistics.h>
+
+#ifndef P4_TO_P8
+#define P4EST_CONN_SUFFIX "p4c"
+#define P4EST_FOREST_SUFFIX "p4p"
+#else
+#define P4EST_CONN_SUFFIX "p8c"
+#define P4EST_FOREST_SUFFIX "p8p"
+#endif
+
+int
+main (int argc, char **argv)
+{                               /* test driver: optionally load a saved forest from a file, then destroy it */
+  sc_MPI_Comm         mpicomm;
+  int                 mpiret;
+  int                 mpirank;
+  int                 first_arg;
+  int                 autopartition;
+  int                 broadcasthead;
+  const char         *filename;
+  p4est_connectivity_t *connectivity;
+  p4est_t            *p4est;
+  sc_options_t       *opt;
+
+  /* initialize MPI and query the rank (mpirank is not read again in this function) */
+  mpiret = sc_MPI_Init (&argc, &argv);
+  SC_CHECK_MPI (mpiret);
+  mpicomm = sc_MPI_COMM_WORLD;
+  mpiret = sc_MPI_Comm_rank (mpicomm, &mpirank);
+  SC_CHECK_MPI (mpiret);
+
+  /* initialize libsc and p4est, both with the default log priority */
+  sc_init (mpicomm, 1, 1, NULL, SC_LP_DEFAULT);
+  p4est_init (NULL, SC_LP_DEFAULT);
+
+  /* register the -f/--file, -a/--autopartition and -b/--broadcasthead options, then parse */
+  opt = sc_options_new (argv[0]);
+  sc_options_add_string (opt, 'f', "file", &filename,
+                         NULL, "p4est data file to load");
+  sc_options_add_bool (opt, 'a', "autopartition", &autopartition,
+                       0, "Create a uniform partition when reading");
+  sc_options_add_bool (opt, 'b', "broadcasthead", &broadcasthead,
+                       0, "Broadcast header information when reading");
+  first_arg = sc_options_parse (p4est_package_id, SC_LP_INFO,
+                                opt, argc, argv);       /* checked against argc right below */
+  SC_CHECK_ABORT (first_arg == argc, "Option error: no arguments required");
+  if (sc_is_root ()) {
+    sc_options_print_summary (p4est_package_id, SC_LP_PRODUCTION, opt);
+  }
+
+  /* only proceed if the filename is set (NULL is passed as its initial value above) */
+  if (filename != NULL) {
+    /* load the forest; its connectivity comes back through &connectivity (the 0, 0 are presumably data_size and load_data -- confirm in p4est_extended.h) */
+    p4est = p4est_load_ext (filename, mpicomm, 0, 0,
+                            autopartition, broadcasthead,
+                            NULL, &connectivity);
+
+    /* destroy the forest first, then the connectivity that came with it */
+    p4est_destroy (p4est);
+    p4est_connectivity_destroy (connectivity);
+  }
+
+  /* release the options object, then shut down libsc and MPI */
+  sc_options_destroy (opt);
+  sc_finalize ();
+
+  mpiret = sc_MPI_Finalize ();
+  SC_CHECK_MPI (mpiret);
+
+  return 0;
+}
diff --git a/example/timings/bricks3.c b/test/test_load3.c
similarity index 92%
copy from example/timings/bricks3.c
copy to test/test_load3.c
index 9e3e1e5..4d92d04 100644
--- a/example/timings/bricks3.c
+++ b/test/test_load3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -22,4 +23,4 @@
 */
 
 #include <p4est_to_p8est.h>
-#include "bricks2.c"
+#include "test_load2.c"
diff --git a/test/test_loadsave2.c b/test/test_loadsave2.c
index 15983ab..8a639ea 100644
--- a/test/test_loadsave2.c
+++ b/test/test_loadsave2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_loadsave3.c b/test/test_loadsave3.c
index 7a833e9..cf3b335 100644
--- a/test/test_loadsave3.c
+++ b/test/test_loadsave3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_order.c b/test/test_order.c
index f75df55..61823f3 100644
--- a/test/test_order.c
+++ b/test/test_order.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_partition2.c b/test/test_partition2.c
index c9a119c..27d0050 100644
--- a/test/test_partition2.c
+++ b/test/test_partition2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -21,14 +22,16 @@
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
 
-#ifdef P4_TO_P8
-#include <p8est_algorithms.h>
-#include <p8est_communication.h>
-#include <p8est_extended.h>
-#else
+#ifndef P4_TO_P8
 #include <p4est_algorithms.h>
 #include <p4est_communication.h>
 #include <p4est_extended.h>
+#include <p4est_search.h>
+#else
+#include <p8est_algorithms.h>
+#include <p8est_communication.h>
+#include <p8est_extended.h>
+#include <p8est_search.h>
 #endif
 
 typedef struct
@@ -111,6 +114,7 @@ test_pertree (p4est_t * p4est, const p4est_gloidx_t * prev_pertree,
   const p4est_topidx_t num_trees = p4est->connectivity->num_trees;
   p4est_gloidx_t     *pertree;
 
+  /* test counting of quadrants in individual trees */
   P4EST_ASSERT ((size_t) num_trees == p4est->trees->elem_count);
   if (new_pertree == NULL) {
     pertree = P4EST_ALLOC (p4est_gloidx_t, num_trees + 1);
@@ -132,7 +136,8 @@ test_pertree (p4est_t * p4est, const p4est_gloidx_t * prev_pertree,
 }
 
 static void
-test_partition_circle (sc_MPI_Comm mpicomm, p4est_connectivity_t * connectivity,
+test_partition_circle (sc_MPI_Comm mpicomm,
+                       p4est_connectivity_t * connectivity,
                        p4est_gloidx_t * pertree1, p4est_gloidx_t * pertree2)
 {
   int                 i, j;
diff --git a/test/test_partition3.c b/test/test_partition3.c
index 84a8287..0be2957 100644
--- a/test/test_partition3.c
+++ b/test/test_partition3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_partition_corr2.c b/test/test_partition_corr2.c
index 0b4ce8c..2217f9e 100644
--- a/test/test_partition_corr2.c
+++ b/test/test_partition_corr2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_partition_corr3.c b/test/test_partition_corr3.c
index 7657e05..a1d2d89 100644
--- a/test/test_partition_corr3.c
+++ b/test/test_partition_corr3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_periodic3.c b/test/test_periodic3.c
index ab01d97..85d5e80 100644
--- a/test/test_periodic3.c
+++ b/test/test_periodic3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -196,8 +197,8 @@ weird_edges[2][2] = {{ 5, 7 }, { 7, 5 }};
 static void
 test_weird (void)
 {
-  const p4est_topidx_t num_edges = 1, num_ett = 2;
-  const p4est_topidx_t num_corners = 1, num_ctt = 4;
+  const p4est_topidx_t num_edges = 1, num_ett = 4;
+  const p4est_topidx_t num_corners = 1, num_ctt = 6;
   int                 i;
   size_t              zz;
   p8est_edge_info_t   ei;
@@ -223,8 +224,12 @@ test_weird (void)
   conn->tree_to_edge[7] = 0;
   conn->edge_to_tree[0] = 0;
   conn->edge_to_tree[1] = 0;
+  conn->edge_to_tree[2] = 0;
+  conn->edge_to_tree[3] = 0;
   conn->edge_to_edge[0] = 5;
-  conn->edge_to_edge[1] = 19;
+  conn->edge_to_edge[1] = 17;
+  conn->edge_to_edge[2] = 7;
+  conn->edge_to_edge[3] = 19;
   conn->ett_offset[0] = 0;
 
   for (i = 0; i < 8; ++i) {
@@ -232,16 +237,22 @@ test_weird (void)
   }
   conn->tree_to_corner[0] = 0;
   conn->tree_to_corner[1] = 0;
+  conn->tree_to_corner[3] = 0;
   conn->tree_to_corner[4] = 0;
   conn->tree_to_corner[5] = 0;
+  conn->tree_to_corner[7] = 0;
   conn->corner_to_tree[0] = 0;
   conn->corner_to_tree[1] = 0;
   conn->corner_to_tree[2] = 0;
   conn->corner_to_tree[3] = 0;
+  conn->corner_to_tree[4] = 0;
+  conn->corner_to_tree[5] = 0;
   conn->corner_to_corner[0] = 0;
   conn->corner_to_corner[1] = 1;
-  conn->corner_to_corner[2] = 4;
-  conn->corner_to_corner[3] = 5;
+  conn->corner_to_corner[2] = 3;
+  conn->corner_to_corner[3] = 4;
+  conn->corner_to_corner[4] = 5;
+  conn->corner_to_corner[5] = 7;
   conn->ctt_offset[0] = 0;
 
   P4EST_ASSERT (p8est_connectivity_is_valid (conn));
@@ -251,11 +262,12 @@ test_weird (void)
   for (i = 0; i < 2; ++i) {
     p8est_find_edge_transform (conn, 0, weird_edges[i][0], &ei);
     SC_CHECK_ABORT ((int) ei.iedge == weird_edges[i][0], "WE ei");
-    SC_CHECK_ABORT (eta->elem_count == 1, "WE count A");
+    SC_CHECK_ABORT (eta->elem_count == 2, "WE count A");
     for (zz = 0; zz < eta->elem_count; ++zz) {
       et = p8est_edge_array_index (eta, zz);
       SC_CHECK_ABORT (et->ntree == 0, "WE tree");
-      SC_CHECK_ABORT ((int) et->nedge == weird_edges[i][1], "WE edge");
+      SC_CHECK_ABORT ((int) et->nedge == weird_edges[i][1]
+                      || (int) et->nedge == weird_edges[i][0], "WE edge");
       SC_CHECK_ABORT (et->nflip == 1, "WE flip");
       SC_CHECK_ABORT (et->corners == et->nedge % 4, "WE corners");
       SC_CHECK_ABORT (et->naxis[0] == 1 && et->naxis[1] == 0 &&
@@ -269,11 +281,24 @@ test_weird (void)
   for (i = 0; i < 8; ++i) {
     p8est_find_corner_transform (conn, 0, i, &ci);
     SC_CHECK_ABORT ((int) ci.icorner == i, "WC ci");
-    SC_CHECK_ABORT ((int) cta->elem_count == 2 - (i & 0x02), "WC count");
-    for (zz = 0; zz < cta->elem_count; ++zz) {
-      ct = p8est_corner_array_index (cta, zz);
-      SC_CHECK_ABORT (ct->ntree == 0, "WC tree");
-      SC_CHECK_ABORT ((size_t) ct->ncorner == 4 * zz + !(i % 2), "WC corner");
+    if (i == 2 || i == 6) {
+      SC_CHECK_ABORT ((int) cta->elem_count == 0, "WC count");
+    }
+    else if (i == 0 || i == 4) {
+      SC_CHECK_ABORT ((int) cta->elem_count == 4, "WC count");
+      for (zz = 0; zz < cta->elem_count; ++zz) {
+        ct = p8est_corner_array_index (cta, zz);
+        SC_CHECK_ABORT (ct->ntree == 0, "WC tree");
+        SC_CHECK_ABORT ((size_t) ct->ncorner == 2 * zz + 1, "WC corner");
+      }
+    }
+    else {
+      SC_CHECK_ABORT ((int) cta->elem_count == 2, "WC count");
+      for (zz = 0; zz < cta->elem_count; ++zz) {
+        ct = p8est_corner_array_index (cta, zz);
+        SC_CHECK_ABORT (ct->ntree == 0, "WC tree");
+        SC_CHECK_ABORT ((size_t) ct->ncorner == 4 * zz, "WC corner");
+      }
     }
   }
   sc_array_reset (cta);
diff --git a/test/test_plex2.c b/test/test_plex2.c
index f91f056..303c89b 100644
--- a/test/test_plex2.c
+++ b/test/test_plex2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -157,41 +158,31 @@ refine_fn (p4est_t * p4est, p4est_topidx_t which_tree,
   return 1;
 }
 
-int
-main (int argc, char **argv)
+static int
+refine_tree_one_fn (p4est_t * p4est, p4est_topidx_t which_tree,
+                    p4est_quadrant_t * quadrant)
+{                               /* refinement callback; p4est and quadrant are not used */
+  return ! !which_tree;         /* 0 for tree 0, 1 for every other tree index */
+}
+
+static int
+test_forest (p4est_t * p4est, int overlap)
 {
   sc_MPI_Comm         mpicomm;
   int                 mpiret;
   int                 mpisize, mpirank;
-  p4est_t            *p4est;
-  p4est_connectivity_t *conn;
   sc_array_t         *points_per_dim, *cone_sizes, *cones,
     *cone_orientations, *coords,
     *children, *parents, *childids, *leaves, *remotes;
   p4est_locidx_t      first_local_quad = -1;
 
   /* initialize MPI */
-  mpiret = sc_MPI_Init (&argc, &argv);
-  SC_CHECK_MPI (mpiret);
-  mpicomm = sc_MPI_COMM_WORLD;
+  mpicomm = p4est->mpicomm;
   mpiret = sc_MPI_Comm_size (mpicomm, &mpisize);
   SC_CHECK_MPI (mpiret);
   mpiret = sc_MPI_Comm_rank (mpicomm, &mpirank);
   SC_CHECK_MPI (mpiret);
 
-  sc_init (mpicomm, 1, 1, NULL, SC_LP_DEFAULT);
-  p4est_init (NULL, SC_LP_DEFAULT);
-
-#ifndef P4_TO_P8
-  conn = p4est_connectivity_new_moebius ();
-#else
-  conn = p8est_connectivity_new_rotcubes ();
-#endif
-  p4est = p4est_new_ext (mpicomm, conn, 0, 1, 1, 0, NULL, NULL);
-  p4est_refine (p4est, 1, refine_fn, NULL);
-  p4est_balance (p4est, P4EST_CONNECT_FULL, NULL);
-  p4est_partition (p4est, 0, NULL);
-
   points_per_dim = sc_array_new (sizeof (p4est_locidx_t));
   cone_sizes = sc_array_new (sizeof (p4est_locidx_t));
   cones = sc_array_new (sizeof (p4est_locidx_t));
@@ -203,7 +194,7 @@ main (int argc, char **argv)
   leaves = sc_array_new (sizeof (p4est_locidx_t));
   remotes = sc_array_new (2 * sizeof (p4est_locidx_t));
 
-  p4est_get_plex_data (p4est, P4EST_CONNECT_FULL, (mpisize > 1) ? 2 : 0,
+  p4est_get_plex_data (p4est, P4EST_CONNECT_FULL, (mpisize > 1) ? overlap : 0,
                        &first_local_quad, points_per_dim, cone_sizes, cones,
                        cone_orientations, coords, children, parents, childids,
                        leaves, remotes);
@@ -282,6 +273,10 @@ main (int argc, char **argv)
                        (PetscInt *) leaves->array, PETSC_COPY_VALUES,
                        (PetscSFNode *) remotes->array, PETSC_COPY_VALUES);
     CHKERRQ (ierr);
+    ierr = DMSetPointSF (plex, pointSF);
+    CHKERRQ (ierr);
+    ierr = PetscSFDestroy (&pointSF);
+    CHKERRQ (ierr);
     ierr = DMViewFromOptions (plex, NULL, "-dm_view");
     CHKERRQ (ierr);
     /* TODO: test with rigid body modes as in plex ex3 */
@@ -304,9 +299,101 @@ main (int argc, char **argv)
   sc_array_destroy (leaves);
   sc_array_destroy (remotes);
 
+  return 0;
+}
+
+static int
+test_big (int argc, char **argv)
+{                               /* builds a refined, balanced, partitioned forest and runs test_forest on it; argc/argv are unused */
+  sc_MPI_Comm         mpicomm;
+  int                 mpiret;
+  p4est_t            *p4est;
+  p4est_connectivity_t *conn;
+
+  /* MPI is initialized by main; here only the world communicator is selected */
+  mpicomm = sc_MPI_COMM_WORLD;
+
+#ifndef P4_TO_P8
+  conn = p4est_connectivity_new_moebius ();
+#else
+  conn = p8est_connectivity_new_rotcubes ();
+#endif
+  p4est = p4est_new_ext (mpicomm, conn, 0, 1, 1, 0, NULL, NULL);
+  p4est_refine (p4est, 1, refine_fn, NULL);
+  p4est_balance (p4est, P4EST_CONNECT_FULL, NULL);
+  p4est_partition (p4est, 0, NULL);
+
+  mpiret = test_forest (p4est, 2);      /* 2 is the overlap passed on to test_forest */
+  if (mpiret) {
+    return mpiret;              /* NOTE(review): this path skips the two destroy calls below -- confirm intended */
+  }
+
   p4est_destroy (p4est);
   p4est_connectivity_destroy (conn);
 
+  return 0;
+}
+
+static int
+test_small (int argc, char **argv)
+{                               /* runs test_forest on a small brick forest before and after partitioning; argc/argv are unused */
+  sc_MPI_Comm         mpicomm;
+  int                 mpiret;
+  p4est_t            *p4est;
+  p4est_connectivity_t *conn;
+
+  /* MPI is initialized by main; here only the world communicator is selected */
+  mpicomm = sc_MPI_COMM_WORLD;
+
+#ifndef P4_TO_P8
+  conn = p4est_connectivity_new_brick (2, 1, 0, 0);
+#else
+  conn = p8est_connectivity_new_brick (2, 1, 1, 0, 0, 0);
+#endif
+  p4est = p4est_new (mpicomm, conn, 0, NULL, NULL);
+  p4est_refine (p4est, 0, refine_tree_one_fn, NULL);    /* callback returns nonzero for every tree but tree 0 */
+
+  mpiret = test_forest (p4est, 0);      /* first run: before p4est_partition, overlap 0 */
+  if (mpiret) {
+    return mpiret;              /* NOTE(review): returns without destroying p4est and conn -- confirm intended */
+  }
+
+  p4est_partition (p4est, 0, NULL);
+
+  mpiret = test_forest (p4est, 0);      /* second run: same forest after p4est_partition */
+  if (mpiret) {
+    return mpiret;              /* NOTE(review): same early return as above, cleanup is skipped */
+  }
+
+  p4est_destroy (p4est);
+  p4est_connectivity_destroy (conn);
+
+  return 0;
+}
+
+int
+main (int argc, char **argv)
+{
+  sc_MPI_Comm         mpicomm;
+  int                 mpiret;
+
+  /* initialize MPI */
+  mpiret = sc_MPI_Init (&argc, &argv);
+  SC_CHECK_MPI (mpiret);
+  mpicomm = sc_MPI_COMM_WORLD;
+
+  sc_init (mpicomm, 1, 1, NULL, SC_LP_DEFAULT);
+  p4est_init (NULL, SC_LP_DEFAULT);
+
+  mpiret = test_small (argc, argv);
+  if (mpiret) {
+    return mpiret;
+  }
+  mpiret = test_big (argc, argv);
+  if (mpiret) {
+    return mpiret;
+  }
+
   sc_finalize ();
 
   mpiret = sc_MPI_Finalize ();
@@ -314,5 +401,3 @@ main (int argc, char **argv)
 
   return 0;
 }
-
-/* EOF test_plex2.c */
diff --git a/test/test_plex3.c b/test/test_plex3.c
index d4a4c4d..3855736 100644
--- a/test/test_plex3.c
+++ b/test/test_plex3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -23,5 +24,3 @@
 
 #include <p4est_to_p8est.h>
 #include "test_plex2.c"
-
-/* EOF test_plex3.c */
diff --git a/test/test_quadrants2.c b/test/test_quadrants2.c
index d24c6e5..1e2929c 100644
--- a/test/test_quadrants2.c
+++ b/test/test_quadrants2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_quadrants3.c b/test/test_quadrants3.c
index 25f4b50..0fe411e 100644
--- a/test/test_quadrants3.c
+++ b/test/test_quadrants3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_reorder2.c b/test/test_reorder2.c
index 67ee46a..44eb4a5 100644
--- a/test/test_reorder2.c
+++ b/test/test_reorder2.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2011 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_reorder3.c b/test/test_reorder3.c
index 9fa216a..021129e 100644
--- a/test/test_reorder3.c
+++ b/test/test_reorder3.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2011 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_replace2.c b/test/test_replace2.c
index 08bec8b..087b074 100644
--- a/test/test_replace2.c
+++ b/test/test_replace2.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_replace3.c b/test/test_replace3.c
index 2ef3bb6..8b8c0c1 100644
--- a/test/test_replace3.c
+++ b/test/test_replace3.c
@@ -3,7 +3,8 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 The University of Texas System
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_search2.c b/test/test_search2.c
index 9bdebe9..8b9aa68 100644
--- a/test/test_search2.c
+++ b/test/test_search2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -73,6 +74,28 @@ refine_fn (p4est_t * p4est, p4est_topidx_t which_tree,
 }
 
 static int
+count_callback (p4est_t * p4est, p4est_topidx_t which_tree,
+                p4est_quadrant_t * quadrant, p4est_locidx_t local_num,
+                void *point)
+{                               /* search callback that counts leaves through p4est->user_pointer; must be called with point == NULL */
+  P4EST_ASSERT (point == NULL);
+
+  if (local_num == -1) {
+    /* local_num of -1: not a leaf yet, so keep recursing to reach one eventually */
+    return 1;
+  }
+  else {
+    p4est_locidx_t     *local_count = (p4est_locidx_t *) p4est->user_pointer;
+
+    /* leaf: its local_num must equal the running count, which is then incremented; the return value shall be ignored for leaves */
+    P4EST_ASSERT (local_count != NULL);
+    SC_CHECK_ABORT (local_num == *local_count, "Count mismatch");
+    ++*local_count;
+    return 0;
+  }
+}
+
+static int
 search_callback (p4est_t * p4est, p4est_topidx_t which_tree,
                  p4est_quadrant_t * quadrant, p4est_locidx_t local_num,
                  void *point)
@@ -130,6 +153,7 @@ main (int argc, char **argv)
   int                 mpiret;
   int                 found_total;
   p4est_locidx_t      jt, Al, Bl;
+  p4est_locidx_t      local_count;
   p4est_connectivity_t *conn;
   p4est_quadrant_t   *A, *B;
   p4est_geometry_t   *geom;
@@ -157,7 +181,7 @@ main (int argc, char **argv)
   geom = p8est_geometry_new_sphere (conn, 1., 0.191728, 0.039856);
   vtkname = "test_search3";
 #endif
-  p4est = p4est_new_ext (mpicomm, conn, 0, 0, 0, 0, NULL, NULL);
+  p4est = p4est_new_ext (mpicomm, conn, 0, 0, 0, 0, NULL, &local_count);
   p4est_refine (p4est, 1, refine_fn, NULL);
   p4est_partition (p4est, 0, NULL);
   p4est_vtk_write_file (p4est, geom, vtkname);
@@ -215,6 +239,11 @@ main (int argc, char **argv)
   SC_CHECK_ABORT (A->p.piggy3.local_num == Al, "Search A");
   SC_CHECK_ABORT (B->p.piggy3.local_num == Bl, "Search B");
 
+  /* Use another search to count local quadrants */
+  local_count = 0;
+  p4est_search (p4est, count_callback, NULL, NULL);
+  SC_CHECK_ABORT (local_count == p4est->local_num_quadrants, "Count search");
+
   /* Clear memory */
   sc_array_destroy (points);
   p4est_destroy (p4est);
diff --git a/test/test_search3.c b/test/test_search3.c
index 6b7a160..c33bd17 100644
--- a/test/test_search3.c
+++ b/test/test_search3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_subcomm2.c b/test/test_subcomm2.c
new file mode 100644
index 0000000..a58a625
--- /dev/null
+++ b/test/test_subcomm2.c
@@ -0,0 +1,291 @@
+/*
+  This file is part of p4est.
+  p4est is a C library to manage a collection (a forest) of multiple
+  connected adaptive quadtrees or octrees in parallel.
+
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
+
+  p4est is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  p4est is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with p4est; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+#ifdef P4_TO_P8
+#include <p8est_algorithms.h>
+#include <p8est_bits.h>
+#include <p8est_communication.h>
+#include <p8est_vtk.h>
+#else
+#include <p4est_algorithms.h>
+#include <p4est_bits.h>
+#include <p4est_communication.h>
+#include <p4est_vtk.h>
+#endif
+
+/**
+ * Runs four tests that reduce a p4est's MPI communicator to a subset of ranks.
+ */
+int
+main (int argc, char **argv)
+{
+  const char         *this_fn_name = P4EST_STRING "_test_subcomm";
+  /* options */
+  const p4est_locidx_t min_quadrants = 15;
+  const int           min_level = 4;
+  const int           fill_uniform = 0;
+  /* parallel environment */
+  sc_MPI_Comm         mpicomm = sc_MPI_COMM_WORLD;
+  int                 mpisize, submpisize, rank;
+  int                 mpiret;
+  /* p4est */
+  p4est_connectivity_t *connectivity;
+  p4est_t            *p4est;
+  p4est_locidx_t     *partition;
+
+  /* initialize MPI */
+  mpiret = sc_MPI_Init (&argc, &argv);
+  SC_CHECK_MPI (mpiret);
+
+  /* exit early: a communicator with a single rank cannot be reduced */
+  mpiret = sc_MPI_Comm_size (mpicomm, &mpisize);
+  SC_CHECK_MPI (mpiret);
+  if (mpisize == 1) {
+    mpiret = sc_MPI_Finalize ();
+    SC_CHECK_MPI (mpiret);
+    return 0;
+  }
+
+  /* initialize p4est */
+  sc_init (mpicomm, 1, 1, NULL, SC_LP_DEFAULT);
+  p4est_init (NULL, SC_LP_DEFAULT);
+
+  /* create connectivity */
+#ifdef P4_TO_P8
+  connectivity = p8est_connectivity_new_unitcube ();
+#else
+  connectivity = p4est_connectivity_new_unitsquare ();
+#endif
+
+  /* create p4est object */
+  p4est = p4est_new_ext (mpicomm, connectivity,
+                         min_quadrants, min_level, fill_uniform,
+                         0, NULL, NULL);
+
+  /* write vtk: new */
+  p4est_vtk_write_file (p4est, NULL, P4EST_STRING "_subcomm_new");
+
+  /* set variables pertaining to the parallel environment */
+  rank = p4est->mpirank;
+  submpisize = mpisize / 2;
+  P4EST_ASSERT (submpisize <= p4est->global_num_quadrants);
+
+  /* construct partitioning that leaves the even ranks empty */
+  {
+    p4est_locidx_t      n_quads_per_proc, n_quads_leftover;
+    int                 p;
+
+    partition = P4EST_ALLOC (p4est_locidx_t, mpisize);
+    n_quads_per_proc = p4est->global_num_quadrants / submpisize;
+    n_quads_leftover = p4est->global_num_quadrants -
+      (n_quads_per_proc * submpisize);
+    for (p = 0; p < mpisize; p++) {
+      if (p % 2) {              /* if this rank will get quadrants */
+        partition[p] = n_quads_per_proc;
+      }
+      else {                    /* if this rank will be empty */
+        partition[p] = 0;
+      }
+    }
+    partition[1] += n_quads_leftover;
+
+    /* check partitioning */
+#ifdef P4EST_ENABLE_DEBUG
+    {
+      p4est_gloidx_t      sum = 0;
+
+      for (p = 0; p < mpisize; p++) {
+        sum += (p4est_gloidx_t) partition[p];
+      }
+      P4EST_ASSERT (sum == p4est->global_num_quadrants);
+    }
+#endif
+  }
+
+  /*
+   * Test 1: Reduce MPI communicator to non-empty ranks
+   */
+
+  P4EST_GLOBAL_INFOF ("%s: Into test 1\n", this_fn_name);
+  {
+    p4est_t            *p4est_subcomm;
+    int                 is_nonempty;
+
+    /* create p4est copy and re-partition */
+    p4est_subcomm = p4est_copy_ext (p4est, 1, 1);
+    (void) p4est_partition_given (p4est_subcomm, partition);
+
+    /* write vtk: partitioned */
+    p4est_vtk_write_file (p4est_subcomm, NULL, P4EST_STRING "_subcomm_part");
+
+    /* reduce MPI communicator to non-empty ranks */
+    is_nonempty = p4est_comm_parallel_env_reduce (&p4est_subcomm);
+    P4EST_ASSERT ((is_nonempty && 0 < partition[rank]) ||
+                  (!is_nonempty && 0 == partition[rank]));
+
+    if (is_nonempty) {
+      /* write vtk: reduced communicator */
+      p4est_vtk_write_file (p4est_subcomm, NULL,
+                            P4EST_STRING "_subcomm_sub1");
+
+      /* destroy the p4est that has a reduced MPI communicator */
+      p4est_destroy (p4est_subcomm);
+    }
+  }
+  mpiret = sc_MPI_Barrier (mpicomm);
+  SC_CHECK_MPI (mpiret);
+  P4EST_GLOBAL_INFOF ("%s: Done test 1\n", this_fn_name);
+
+  /*
+   * Test 2: Reduce MPI communicator to non-empty ranks, but now the MPI
+   * communicator is not owned
+   */
+
+  P4EST_GLOBAL_INFOF ("%s: Into test 2\n", this_fn_name);
+  {
+    p4est_t            *p4est_subcomm;
+    int                 is_nonempty;
+
+    /* create p4est copy and re-partition */
+    p4est_subcomm = p4est_copy_ext (p4est, 1, 0 /* don't dup. comm. */ );
+    (void) p4est_partition_given (p4est_subcomm, partition);
+
+    /* reduce MPI communicator to non-empty ranks */
+    is_nonempty = p4est_comm_parallel_env_reduce (&p4est_subcomm);
+    P4EST_ASSERT ((is_nonempty && 0 < partition[rank]) ||
+                  (!is_nonempty && 0 == partition[rank]));
+
+    if (is_nonempty) {
+      /* destroy the p4est that has a reduced MPI communicator */
+      p4est_destroy (p4est_subcomm);
+    }
+  }
+  mpiret = sc_MPI_Barrier (mpicomm);
+  SC_CHECK_MPI (mpiret);
+  P4EST_GLOBAL_INFOF ("%s: Done test 2\n", this_fn_name);
+
+  /*
+   * Test 3: Reduce MPI communicator to non-empty ranks, but keep rank 0
+   */
+
+  P4EST_GLOBAL_INFOF ("%s: Into test 3\n", this_fn_name);
+  {
+    p4est_t            *p4est_subcomm;
+    int                 sub_exists;
+    sc_MPI_Group        group, group_reserve;
+    int                 reserve_range[1][3];
+
+    /* create group of full MPI communicator */
+    mpiret = sc_MPI_Comm_group (mpicomm, &group);
+    SC_CHECK_MPI (mpiret);
+
+    /* create sub-group containing only rank 0 */
+    reserve_range[0][0] = 0;
+    reserve_range[0][1] = 0;
+    reserve_range[0][2] = 1;
+    mpiret =
+      sc_MPI_Group_range_incl (group, 1, reserve_range, &group_reserve);
+    SC_CHECK_MPI (mpiret);
+
+    /* create p4est copy and re-partition */
+    p4est_subcomm = p4est_copy_ext (p4est, 1, 1);
+    (void) p4est_partition_given (p4est_subcomm, partition);
+
+    /* reduce MPI communicator to non-empty ranks, but keep rank 0 */
+    sub_exists = p4est_comm_parallel_env_reduce_ext (&p4est_subcomm,
+                                                     group_reserve, 1, NULL);
+    P4EST_ASSERT ((sub_exists && (0 < partition[rank] || rank == 0)) ||
+                  (!sub_exists && 0 == partition[rank]));
+
+    if (sub_exists) {
+      /* write vtk: reduced communicator */
+      p4est_vtk_write_file (p4est_subcomm, NULL,
+                            P4EST_STRING "_subcomm_sub3");
+
+      /* destroy the p4est that has a reduced MPI communicator */
+      p4est_destroy (p4est_subcomm);
+    }
+  }
+  mpiret = sc_MPI_Barrier (mpicomm);
+  SC_CHECK_MPI (mpiret);
+  P4EST_GLOBAL_INFOF ("%s: Done test 3\n", this_fn_name);
+
+  /*
+   * Test 4: Reduce MPI communicator to non-empty ranks, but keep last 2 ranks
+   */
+
+  P4EST_GLOBAL_INFOF ("%s: Into test 4\n", this_fn_name);
+  {
+    p4est_t            *p4est_subcomm;
+    int                 sub_exists;
+    sc_MPI_Group        group, group_reserve;
+    int                 reserve_range[1][3];
+
+    /* create group of full MPI communicator */
+    mpiret = sc_MPI_Comm_group (mpicomm, &group);
+    SC_CHECK_MPI (mpiret);
+
+    /* create sub-group containing only last 2 ranks */
+    reserve_range[0][0] = SC_MAX (0, mpisize - 2);
+    reserve_range[0][1] = mpisize - 1;
+    reserve_range[0][2] = 1;
+    mpiret =
+      sc_MPI_Group_range_incl (group, 1, reserve_range, &group_reserve);
+    SC_CHECK_MPI (mpiret);
+
+    /* create p4est copy and re-partition */
+    p4est_subcomm = p4est_copy_ext (p4est, 1, 1);
+    (void) p4est_partition_given (p4est_subcomm, partition);
+
+    /* reduce MPI communicator to non-empty ranks, but keep last 2 ranks */
+    sub_exists = p4est_comm_parallel_env_reduce_ext (&p4est_subcomm,
+                                                     group_reserve, 0, NULL);
+    P4EST_ASSERT ((sub_exists && (0 < partition[rank] || mpisize - 2 <= rank))
+                  || (!sub_exists && 0 == partition[rank]));
+
+    if (sub_exists) {
+      /* write vtk: reduced communicator */
+      p4est_vtk_write_file (p4est_subcomm, NULL,
+                            P4EST_STRING "_subcomm_sub4");
+
+      /* destroy the p4est that has a reduced MPI communicator */
+      p4est_destroy (p4est_subcomm);
+    }
+  }
+  mpiret = sc_MPI_Barrier (mpicomm);
+  SC_CHECK_MPI (mpiret);
+  P4EST_GLOBAL_INFOF ("%s: Done test 4\n", this_fn_name);
+
+  /* destroy */
+  P4EST_FREE (partition);
+  p4est_destroy (p4est);
+  p4est_connectivity_destroy (connectivity);
+
+  /* finalize */
+  sc_finalize ();
+  mpiret = sc_MPI_Finalize ();
+  SC_CHECK_MPI (mpiret);
+
+  return 0;
+}
diff --git a/example/timings/bricks3.c b/test/test_subcomm3.c
similarity index 92%
copy from example/timings/bricks3.c
copy to test/test_subcomm3.c
index 9e3e1e5..c645e1c 100644
--- a/example/timings/bricks3.c
+++ b/test/test_subcomm3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -22,4 +23,4 @@
 */
 
 #include <p4est_to_p8est.h>
-#include "bricks2.c"
+#include "test_subcomm2.c"
diff --git a/test/test_valid2.c b/test/test_valid2.c
index 1e7a960..611feb3 100644
--- a/test/test_valid2.c
+++ b/test/test_valid2.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
@@ -45,7 +46,7 @@ static const int    refine_level = 5;
 static const int    refine_level = 4;
 #endif
 
-static p4est_balance_type_t
+static              p4est_balance_type_t
 check_backward_compatibility (void)
 {
   p4est_balance_type_t b;
@@ -151,6 +152,31 @@ check_all (sc_MPI_Comm mpicomm, p4est_connectivity_t * conn,
   p4est_connectivity_destroy (conn);
 }
 
+static void
+check_int_types (void)
+{
+  p4est_qcoord_t      qco, qobs;
+  p4est_topidx_t      top, tobs;
+  p4est_locidx_t      loc, lobs;
+  p4est_gloidx_t      glo, gobs;
+
+  qco = P4EST_QCOORD_MAX;
+  qobs = P4EST_QCOORD_ABS (qco);
+  SC_CHECK_ABORT (qco == qobs, "Failed qcoord abs function");
+
+  top = P4EST_TOPIDX_MAX;
+  tobs = P4EST_TOPIDX_ABS (top);
+  SC_CHECK_ABORT (top == tobs, "Failed topidx abs function");
+
+  loc = P4EST_LOCIDX_MAX;
+  lobs = P4EST_LOCIDX_ABS (loc);
+  SC_CHECK_ABORT (loc == lobs, "Failed locidx abs function");
+
+  glo = P4EST_GLOIDX_MAX;
+  gobs = P4EST_GLOIDX_ABS (glo);
+  SC_CHECK_ABORT (glo == gobs, "Failed gloidx abs function");
+}
+
 int
 main (int argc, char **argv)
 {
@@ -170,6 +196,7 @@ main (int argc, char **argv)
   p4est_init (NULL, SC_LP_DEFAULT);
 
   (void) check_backward_compatibility ();
+  check_int_types ();
 
 #ifndef P4_TO_P8
   check_all (mpicomm, p4est_connectivity_new_unitsquare (),
diff --git a/test/test_valid3.c b/test/test_valid3.c
index 40290bb..293f0bb 100644
--- a/test/test_valid3.c
+++ b/test/test_valid3.c
@@ -4,6 +4,7 @@
   connected adaptive quadtrees or octrees in parallel.
 
   Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
   Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
diff --git a/test/test_wrap2.c b/test/test_wrap2.c
index 36e45d6..0d6088f 100644
--- a/test/test_wrap2.c
+++ b/test/test_wrap2.c
@@ -3,7 +3,9 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 Carsten Burstedde
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -29,8 +31,15 @@
 static int
 wrap_adapt_partition (p4est_wrap_t * wrap, int weight_exponent)
 {
+  p4est_locidx_t      uf, ul;
+
   if (p4est_wrap_adapt (wrap)) {
-    if (p4est_wrap_partition (wrap, weight_exponent)) {
+    if (p4est_wrap_partition (wrap, weight_exponent, &uf, &ul, NULL)) {
+
+      SC_CHECK_ABORT (uf >= 0 && ul >= 0, "Invalid post window");
+      SC_CHECK_ABORT (uf + ul <= wrap->p4est->local_num_quadrants,
+                      "Invalid post count");
+
       p4est_wrap_complete (wrap);
     }
     return 1;
@@ -39,6 +48,40 @@ wrap_adapt_partition (p4est_wrap_t * wrap, int weight_exponent)
   return 0;
 }
 
+static void
+test_coarsen_delay (p4est_wrap_t * wrap)
+{
+  p4est_locidx_t      jl;
+  p4est_wrap_leaf_t  *leaf;
+  p4est_wrap_t       *copy1, *copy2;
+
+  p4est_wrap_set_coarsen_delay (wrap, 2, 0);
+
+  for (jl = 0, leaf = p4est_wrap_leaf_first (wrap, 1); leaf != NULL;
+       jl++, leaf = p4est_wrap_leaf_next (leaf)) {
+    if (leaf->which_quad % 4 == 0) {
+      p4est_wrap_mark_refine (wrap, leaf->which_tree, leaf->which_quad);
+    }
+  }
+
+  copy1 = p4est_wrap_new_copy (wrap, 17, NULL, NULL);
+  copy2 = p4est_wrap_new_copy (copy1, 0, NULL, NULL);
+
+  wrap_adapt_partition (wrap, 1);
+
+  /* copies must be destroyed before the original is destroyed, in any order */
+  p4est_wrap_destroy (copy1);
+
+  for (jl = 0, leaf = p4est_wrap_leaf_first (wrap, 1); leaf != NULL;
+       jl++, leaf = p4est_wrap_leaf_next (leaf)) {
+    p4est_wrap_mark_coarsen (wrap, leaf->which_tree, leaf->which_quad);
+  }
+  wrap_adapt_partition (wrap, 1);
+
+  /* copies must be destroyed before the original is destroyed, anywhere */
+  p4est_wrap_destroy (copy2);
+}
+
 int
 main (int argc, char **argv)
 {
@@ -50,6 +93,7 @@ main (int argc, char **argv)
 #else
   int                 lp = SC_LP_PRODUCTION;
 #endif
+  p4est_topidx_t      treecount;
   p4est_locidx_t      jl;
   p4est_wrap_leaf_t  *leaf;
   p4est_ghost_t      *ghost;
@@ -78,7 +122,7 @@ main (int argc, char **argv)
 
   for (loop = 0; loop < 3; ++loop) {
     /* mark for refinement */
-    for (jl = 0, leaf = p4est_wrap_leaf_first (wrap); leaf != NULL;
+    for (jl = 0, leaf = p4est_wrap_leaf_first (wrap, 1); leaf != NULL;
          jl++, leaf = p4est_wrap_leaf_next (leaf)) {
       if (leaf->which_quad % 3 == 0) {
         p4est_wrap_mark_refine (wrap, leaf->which_tree, leaf->which_quad);
@@ -92,22 +136,29 @@ main (int argc, char **argv)
 
   for (loop = 0; loop < 2; ++loop) {
     /* mark some elements for coarsening that does not effect anything */
-    for (jl = 0, leaf = p4est_wrap_leaf_first (wrap); leaf != NULL;
+    treecount = 0;
+    for (jl = 0, leaf = p4est_wrap_leaf_first (wrap, 0); leaf != NULL;
          jl++, leaf = p4est_wrap_leaf_next (leaf)) {
+      if (P4EST_LEAF_IS_FIRST_IN_TREE (leaf)) {
+        ++treecount;
+      }
       if (leaf->which_quad % 5 == 0) {
         p4est_wrap_mark_refine (wrap, leaf->which_tree, leaf->which_quad);
         p4est_wrap_mark_coarsen (wrap, leaf->which_tree, leaf->which_quad);
       }
     }
     SC_CHECK_ABORT (jl == wrap->p4est->local_num_quadrants, "Iterator");
+    /* this test should also be fine with empty processors */
+    SC_CHECK_ABORT (treecount == wrap->p4est->last_local_tree -
+                    wrap->p4est->first_local_tree + 1, "Iterator");
 
     changed = wrap_adapt_partition (wrap, 0);
     SC_CHECK_ABORT (!changed, "Wrap noop");
   }
-  
+
   for (loop = 0; loop < 2; ++loop) {
     /* mark for coarsening */
-    for (jl = 0, leaf = p4est_wrap_leaf_first (wrap); leaf != NULL;
+    for (jl = 0, leaf = p4est_wrap_leaf_first (wrap, 1); leaf != NULL;
          jl++, leaf = p4est_wrap_leaf_next (leaf)) {
       if ((leaf->which_quad / 13) % 17 != 3) {
         p4est_wrap_mark_coarsen (wrap, leaf->which_tree, leaf->which_quad);
@@ -118,6 +169,8 @@ main (int argc, char **argv)
     (void) wrap_adapt_partition (wrap, 0);
   }
 
+  test_coarsen_delay (wrap);
+
   p4est_wrap_destroy (wrap);
 
   sc_finalize ();
diff --git a/test/test_wrap3.c b/test/test_wrap3.c
index c2e1816..a57acec 100644
--- a/test/test_wrap3.c
+++ b/test/test_wrap3.c
@@ -3,7 +3,9 @@
   p4est is a C library to manage a collection (a forest) of multiple
   connected adaptive quadtrees or octrees in parallel.
 
-  Copyright (C) 2012 Carsten Burstedde
+  Copyright (C) 2010 The University of Texas System
+  Additional copyright (C) 2011 individual authors
+  Written by Carsten Burstedde, Lucas C. Wilcox, and Tobin Isaac
 
   p4est is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/p4est.git



More information about the debian-science-commits mailing list